From d3caacd636684199383cec55f50f3ead255c8e79 Mon Sep 17 00:00:00 2001 From: ShreyaLnuHpe Date: Tue, 13 Aug 2024 09:23:02 -0700 Subject: [PATCH 01/16] fix: reject reconnecting agents with different resource pool configuration --- agent/internal/agent.go | 2 + master/internal/rm/agentrm/agent.go | 13 +++++ master/internal/rm/agentrm/agent_state.go | 14 ++++++ .../internal/rm/agentrm/agent_state_test.go | 50 +++++++++++++++++++ master/pkg/aproto/master_message.go | 1 + 5 files changed, 80 insertions(+) diff --git a/agent/internal/agent.go b/agent/internal/agent.go index 80693407d32..e30003f92bf 100644 --- a/agent/internal/agent.go +++ b/agent/internal/agent.go @@ -160,6 +160,7 @@ func (a *Agent) run(ctx context.Context) error { Version: a.version, Devices: devices, ContainersReattached: reattached, + ResourcePoolName: a.opts.ResourcePool, }}: case <-ctx.Done(): return ctx.Err() @@ -351,6 +352,7 @@ func (a *Agent) reconnectFlow( Version: a.version, Devices: devices, ContainersReattached: reattached, + ResourcePoolName: a.opts.ResourcePool, }}: case <-ctx.Done(): return nil, nil, ctx.Err() diff --git a/master/internal/rm/agentrm/agent.go b/master/internal/rm/agentrm/agent.go index a5465898910..7c762565e05 100644 --- a/master/internal/rm/agentrm/agent.go +++ b/master/internal/rm/agentrm/agent.go @@ -620,6 +620,19 @@ func (a *agent) HandleIncomingWebsocketMessage(msg *aproto.MasterMessage) { a.stop(err) return } + resourcePoolErr := a.agentState.checkAgentResourcePoolMatch(msg.AgentStarted) + if resourcePoolErr != nil { + a.syslog.WithError(resourcePoolErr). + Error("change in agent resource pool was detected") + a.socket.Outbox <- aproto.AgentMessage{ + AgentShutdown: &aproto.AgentShutdown{ + ErrMsg: aproto.ErrAgentMustReconnect.Error(), + }, + } + + a.stop(resourcePoolErr) + return + } } else { a.agentStarted(msg.AgentStarted) } diff --git a/master/internal/rm/agentrm/agent_state.go b/master/internal/rm/agentrm/agent_state.go index 0a5941173db..fd5e43a7964 100644 --- a/master/internal/rm/agentrm/agent_state.go +++ b/master/internal/rm/agentrm/agent_state.go @@ -289,6 +289,20 @@ func (a *agentState) checkAgentStartedDevicesMatch( return nil } +func (a *agentState) checkAgentResourcePoolMatch( + agentStarted *aproto.AgentStarted, +) error { + // We need to compare the resource pool configurations between the agent and the master. + // Ideally, the master doesn't request new resource pool information if the agent reconnects + // within the designated reconnection period, while the agent should read from its updated configuration. + // If there's a mismatch, an error will be thrown, causing the agent to stop and require a restart. + if a.resourcePoolName != agentStarted.ResourcePoolName { + return fmt.Errorf("resource pool has changed: %s -> %s", a.resourcePoolName, agentStarted.ResourcePoolName) + } + + return nil +} + func (a *agentState) containerStateChanged(msg aproto.ContainerStateChanged) { for _, d := range msg.Container.Devices { s, ok := a.slotStates[d.ID] diff --git a/master/internal/rm/agentrm/agent_state_test.go b/master/internal/rm/agentrm/agent_state_test.go index 3341f83f9d0..56a9993bd33 100644 --- a/master/internal/rm/agentrm/agent_state_test.go +++ b/master/internal/rm/agentrm/agent_state_test.go @@ -70,6 +70,7 @@ func TestAgentStatePersistence(t *testing.T) { Version: "", Devices: devices, ContainersReattached: []aproto.ContainerReattachAck{}, + ResourcePoolName: "", } state.agentStarted(started) require.Len(t, state.getSlotsSummary("/myagent"), 2) @@ -322,6 +323,54 @@ func Test_agentState_checkAgentStartedDevicesMatch(t *testing.T) { } } +func Test_agentState_checkAgentResourcePoolMatch(t *testing.T) { + tests := []struct { + name string + state agentState + agentStarted *aproto.AgentStarted + wantErrContains string + }{ + { + name: "resource pool name match", + state: agentState{ + resourcePoolName: "pool1", + }, + agentStarted: &aproto.AgentStarted{ + ResourcePoolName: "pool1", + }, + wantErrContains: "", + }, + { + name: "resource pool name is missing", + state: agentState{ + resourcePoolName: "pool1", + }, + agentStarted: &aproto.AgentStarted{ResourcePoolName: ""}, + wantErrContains: "resource pool has changed", + }, + { + name: "mismatched resource pool name", + state: agentState{ + resourcePoolName: "pool1", + }, + agentStarted: &aproto.AgentStarted{ + ResourcePoolName: "pool2", + }, + wantErrContains: "resource pool has changed", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := tt.state.checkAgentResourcePoolMatch(tt.agentStarted) + if tt.wantErrContains == "" { + require.NoError(t, err) + return + } + require.ErrorContains(t, err, tt.wantErrContains) + }) + } +} + func TestSlotStates(t *testing.T) { rpName := "test" state := newAgentState(aproto.ID(uuid.NewString()), 64) @@ -345,6 +394,7 @@ func TestSlotStates(t *testing.T) { Version: "", Devices: devices, ContainersReattached: []aproto.ContainerReattachAck{}, + ResourcePoolName: "", } state.agentStarted(started) slots := state.getSlotsSummary("/") diff --git a/master/pkg/aproto/master_message.go b/master/pkg/aproto/master_message.go index edbefb08b1c..ea2385377b6 100644 --- a/master/pkg/aproto/master_message.go +++ b/master/pkg/aproto/master_message.go @@ -80,6 +80,7 @@ type AgentStarted struct { Version string Devices []device.Device ContainersReattached []ContainerReattachAck + ResourcePoolName string } // ContainerStateChanged notifies the master that the agent transitioned the container state. From b4a24f79ebe5b461d512d7dced83b0d54d3c8851 Mon Sep 17 00:00:00 2001 From: ShreyaLnuHpe Date: Tue, 13 Aug 2024 22:47:38 -0700 Subject: [PATCH 02/16] add e2e test --- .../double-reattach.devcluster.yaml | 16 +++++++++++++ .../tests/cluster/test_master_restart.py | 23 +++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/.circleci/devcluster/double-reattach.devcluster.yaml b/.circleci/devcluster/double-reattach.devcluster.yaml index a7111202fa8..7333c64ff57 100644 --- a/.circleci/devcluster/double-reattach.devcluster.yaml +++ b/.circleci/devcluster/double-reattach.devcluster.yaml @@ -43,6 +43,10 @@ stages: pool_name: default provider: null task_container_defaults: null + - pool_name: pool1 + max_slots: 8 + scheduler: + type: priority scim: enabled: true auth: @@ -93,3 +97,15 @@ stages: agent_reconnect_backoff: 5 container_auto_remove_disabled: true artificial_slots: 4 + + - agent: + name: agent20 # Copy of agent1, but with different resource pool. + config_file: + master_host: 127.0.0.1 + master_port: 8081 + agent_id: agent1 + container_master_host: $DOCKER_LOCALHOST + agent_reconnect_attempts: 24 + agent_reconnect_backoff: 5 + container_auto_remove_disabled: true + resource_pool: pool1 \ No newline at end of file diff --git a/e2e_tests/tests/cluster/test_master_restart.py b/e2e_tests/tests/cluster/test_master_restart.py index cc20d4931e4..039c2a21bf6 100644 --- a/e2e_tests/tests/cluster/test_master_restart.py +++ b/e2e_tests/tests/cluster/test_master_restart.py @@ -710,3 +710,26 @@ def test_master_restart_with_queued( for cmd_id in [running_command_id, queued_command_id]: utils.wait_for_command_state(sess, cmd_id, "TERMINATED", 90) utils.assert_command_succeeded(sess, cmd_id) + + +@pytest.mark.managed_devcluster +def test_agent_resource_pool_change( + restartable_managed_cluster: managed_cluster.ManagedCluster, +) -> None: + admin = api_utils.admin_session() + try: + restartable_managed_cluster.kill_agent() + restartable_managed_cluster.dc.restart_stage("agent20") + + for _i in range(5): + agent_data = managed_cluster.get_agent_data(admin) + if len(agent_data) == 0: + # Agent has exploded and been wiped due to resource pool mismatch, as expected. + break + else: + pytest.fail( + f"agent with different resource pool is still present after {_i} ticks:{agent_data}" + ) + finally: + restartable_managed_cluster.dc.kill_stage("agent20") + restartable_managed_cluster.restart_agent() From 2a4d4f2d1b52df7be70d12a01f40dd0ef133d931 Mon Sep 17 00:00:00 2001 From: ShreyaLnuHpe Date: Wed, 14 Aug 2024 15:01:38 -0700 Subject: [PATCH 03/16] add extra test cases --- .../double-reattach.devcluster.yaml | 13 +++++++++- .../tests/cluster/test_master_restart.py | 20 ++++++++++++++ master/internal/rm/agentrm/agent_state.go | 16 ++++++++---- .../internal/rm/agentrm/agent_state_test.go | 26 +++++++++++++++++++ .../rm/agentrm/resource_managers_test.go | 2 -- 5 files changed, 69 insertions(+), 8 deletions(-) diff --git a/.circleci/devcluster/double-reattach.devcluster.yaml b/.circleci/devcluster/double-reattach.devcluster.yaml index 7333c64ff57..59026d25f58 100644 --- a/.circleci/devcluster/double-reattach.devcluster.yaml +++ b/.circleci/devcluster/double-reattach.devcluster.yaml @@ -108,4 +108,15 @@ stages: agent_reconnect_attempts: 24 agent_reconnect_backoff: 5 container_auto_remove_disabled: true - resource_pool: pool1 \ No newline at end of file + resource_pool: pool1 + + - agent: + name: agent30 # Copy of agent1, but with empty(default) resource pool. + config_file: + master_host: 127.0.0.1 + master_port: 8081 + agent_id: agent1 + container_master_host: $DOCKER_LOCALHOST + agent_reconnect_attempts: 24 + agent_reconnect_backoff: 5 + container_auto_remove_disabled: true \ No newline at end of file diff --git a/e2e_tests/tests/cluster/test_master_restart.py b/e2e_tests/tests/cluster/test_master_restart.py index 1bd9fb6423e..294f04fd41b 100644 --- a/e2e_tests/tests/cluster/test_master_restart.py +++ b/e2e_tests/tests/cluster/test_master_restart.py @@ -779,3 +779,23 @@ def test_agent_resource_pool_change( finally: restartable_managed_cluster.dc.kill_stage("agent20") restartable_managed_cluster.restart_agent() + + +@pytest.mark.managed_devcluster +def test_agent_resource_pool_unchanged( + restartable_managed_cluster: managed_cluster.ManagedCluster, +) -> None: + admin = api_utils.admin_session() + try: + restartable_managed_cluster.kill_agent() + restartable_managed_cluster.dc.restart_stage("agent30") + + for _i in range(5): + agent_data = managed_cluster.get_agent_data(admin) + if len(agent_data) == 0: + # Agent has exploded and been wiped due to resource pool mismatch, + # which is not expected. + pytest.fail("agent exploded even with the same resource pool") + finally: + restartable_managed_cluster.dc.kill_stage("agent30") + restartable_managed_cluster.restart_agent() diff --git a/master/internal/rm/agentrm/agent_state.go b/master/internal/rm/agentrm/agent_state.go index fd5e43a7964..da45dbbf383 100644 --- a/master/internal/rm/agentrm/agent_state.go +++ b/master/internal/rm/agentrm/agent_state.go @@ -20,6 +20,8 @@ import ( "github.com/determined-ai/determined/master/pkg/model" ) +const defaultResourcePoolName = "default" + type slotEnabled struct { deviceAdded bool agentEnabled bool @@ -289,14 +291,18 @@ func (a *agentState) checkAgentStartedDevicesMatch( return nil } +// We need to compare the resource pool configurations between the agent and the master. +// Ideally, the master doesn't request new resource pool information if the agent reconnects +// within the designated reconnection period, while the agent should read from its updated configuration. +// If there's a mismatch, an error will be thrown, causing the agent to stop and require a restart. func (a *agentState) checkAgentResourcePoolMatch( agentStarted *aproto.AgentStarted, ) error { - // We need to compare the resource pool configurations between the agent and the master. - // Ideally, the master doesn't request new resource pool information if the agent reconnects - // within the designated reconnection period, while the agent should read from its updated configuration. - // If there's a mismatch, an error will be thrown, causing the agent to stop and require a restart. - if a.resourcePoolName != agentStarted.ResourcePoolName { + // If the agent's resource pool is empty in the configuration and the master has it set to default, + // the agent should not be restarted. However, if the agent's resource pool differs from the master's record, + // the agent should be restarted. + if !(agentStarted.ResourcePoolName == "" && a.resourcePoolName == defaultResourcePoolName) && + (a.resourcePoolName != agentStarted.ResourcePoolName) { return fmt.Errorf("resource pool has changed: %s -> %s", a.resourcePoolName, agentStarted.ResourcePoolName) } diff --git a/master/internal/rm/agentrm/agent_state_test.go b/master/internal/rm/agentrm/agent_state_test.go index 56a9993bd33..79a0a9c0a2a 100644 --- a/master/internal/rm/agentrm/agent_state_test.go +++ b/master/internal/rm/agentrm/agent_state_test.go @@ -340,6 +340,16 @@ func Test_agentState_checkAgentResourcePoolMatch(t *testing.T) { }, wantErrContains: "", }, + { + name: "resource pool name match", + state: agentState{ + resourcePoolName: "default", + }, + agentStarted: &aproto.AgentStarted{ + ResourcePoolName: "", + }, + wantErrContains: "", + }, { name: "resource pool name is missing", state: agentState{ @@ -348,6 +358,22 @@ func Test_agentState_checkAgentResourcePoolMatch(t *testing.T) { agentStarted: &aproto.AgentStarted{ResourcePoolName: ""}, wantErrContains: "resource pool has changed", }, + { + name: "resource pool name is missing", + state: agentState{ + resourcePoolName: "default", + }, + agentStarted: &aproto.AgentStarted{ResourcePoolName: "pool1"}, + wantErrContains: "resource pool has changed", + }, + { + name: "resource pool name is missing", + state: agentState{ + resourcePoolName: "", + }, + agentStarted: &aproto.AgentStarted{ResourcePoolName: "pool1"}, + wantErrContains: "resource pool has changed", + }, { name: "mismatched resource pool name", state: agentState{ diff --git a/master/internal/rm/agentrm/resource_managers_test.go b/master/internal/rm/agentrm/resource_managers_test.go index dea2af873ca..0cd82575b05 100644 --- a/master/internal/rm/agentrm/resource_managers_test.go +++ b/master/internal/rm/agentrm/resource_managers_test.go @@ -15,8 +15,6 @@ import ( "github.com/determined-ai/determined/master/internal/user" ) -const defaultResourcePoolName = "default" - func TestResourceManagerForwardMessage(t *testing.T) { user.InitService(nil, nil) conf := &config.ResourceConfig{ From 8076a1b062b071bf81c3a69c149141acf539a60c Mon Sep 17 00:00:00 2001 From: ShreyaLnuHpe Date: Wed, 14 Aug 2024 16:48:11 -0700 Subject: [PATCH 04/16] defaulted resource pool in options --- agent/internal/options/options.go | 1 + master/internal/rm/agentrm/agent_state.go | 6 +---- .../internal/rm/agentrm/agent_state_test.go | 26 +++---------------- 3 files changed, 6 insertions(+), 27 deletions(-) diff --git a/agent/internal/options/options.go b/agent/internal/options/options.go index 74d72b9e426..02120f639ae 100644 --- a/agent/internal/options/options.go +++ b/agent/internal/options/options.go @@ -28,6 +28,7 @@ func DefaultOptions() *Options { Level: "trace", Color: true, }, + ResourcePool: "default", SlotType: "auto", VisibleGPUs: VisibleGPUsFromEnvironment(), BindIP: "0.0.0.0", diff --git a/master/internal/rm/agentrm/agent_state.go b/master/internal/rm/agentrm/agent_state.go index da45dbbf383..6b4d6a4cd14 100644 --- a/master/internal/rm/agentrm/agent_state.go +++ b/master/internal/rm/agentrm/agent_state.go @@ -298,11 +298,7 @@ func (a *agentState) checkAgentStartedDevicesMatch( func (a *agentState) checkAgentResourcePoolMatch( agentStarted *aproto.AgentStarted, ) error { - // If the agent's resource pool is empty in the configuration and the master has it set to default, - // the agent should not be restarted. However, if the agent's resource pool differs from the master's record, - // the agent should be restarted. - if !(agentStarted.ResourcePoolName == "" && a.resourcePoolName == defaultResourcePoolName) && - (a.resourcePoolName != agentStarted.ResourcePoolName) { + if a.resourcePoolName != agentStarted.ResourcePoolName { return fmt.Errorf("resource pool has changed: %s -> %s", a.resourcePoolName, agentStarted.ResourcePoolName) } diff --git a/master/internal/rm/agentrm/agent_state_test.go b/master/internal/rm/agentrm/agent_state_test.go index 79a0a9c0a2a..921c876c337 100644 --- a/master/internal/rm/agentrm/agent_state_test.go +++ b/master/internal/rm/agentrm/agent_state_test.go @@ -70,7 +70,7 @@ func TestAgentStatePersistence(t *testing.T) { Version: "", Devices: devices, ContainersReattached: []aproto.ContainerReattachAck{}, - ResourcePoolName: "", + ResourcePoolName: defaultResourcePoolName, } state.agentStarted(started) require.Len(t, state.getSlotsSummary("/myagent"), 2) @@ -340,36 +340,18 @@ func Test_agentState_checkAgentResourcePoolMatch(t *testing.T) { }, wantErrContains: "", }, - { - name: "resource pool name match", - state: agentState{ - resourcePoolName: "default", - }, - agentStarted: &aproto.AgentStarted{ - ResourcePoolName: "", - }, - wantErrContains: "", - }, { name: "resource pool name is missing", state: agentState{ resourcePoolName: "pool1", }, - agentStarted: &aproto.AgentStarted{ResourcePoolName: ""}, - wantErrContains: "resource pool has changed", - }, - { - name: "resource pool name is missing", - state: agentState{ - resourcePoolName: "default", - }, - agentStarted: &aproto.AgentStarted{ResourcePoolName: "pool1"}, + agentStarted: &aproto.AgentStarted{ResourcePoolName: defaultResourcePoolName}, wantErrContains: "resource pool has changed", }, { name: "resource pool name is missing", state: agentState{ - resourcePoolName: "", + resourcePoolName: defaultResourcePoolName, }, agentStarted: &aproto.AgentStarted{ResourcePoolName: "pool1"}, wantErrContains: "resource pool has changed", @@ -420,7 +402,7 @@ func TestSlotStates(t *testing.T) { Version: "", Devices: devices, ContainersReattached: []aproto.ContainerReattachAck{}, - ResourcePoolName: "", + ResourcePoolName: defaultResourcePoolName, } state.agentStarted(started) slots := state.getSlotsSummary("/") From cbf68842dd26c6a60295cfe47b4b8e7beeda5b57 Mon Sep 17 00:00:00 2001 From: ShreyaLnuHpe Date: Sun, 18 Aug 2024 22:33:53 -0700 Subject: [PATCH 05/16] correct e2e test with new fixture --- .../double-reattach.devcluster.yaml | 29 +---- .../multi-resource-pools.devcluster.yaml | 110 ++++++++++++++++++ agent/cmd/determined-agent/run_test.go | 4 + agent/internal/options/options_test.go | 1 + e2e_tests/tests/cluster/conftest.py | 2 + e2e_tests/tests/cluster/managed_cluster.py | 29 ++++- .../tests/cluster/test_master_restart.py | 20 ++-- 7 files changed, 156 insertions(+), 39 deletions(-) create mode 100644 .circleci/devcluster/multi-resource-pools.devcluster.yaml diff --git a/.circleci/devcluster/double-reattach.devcluster.yaml b/.circleci/devcluster/double-reattach.devcluster.yaml index 59026d25f58..4aebab4c76b 100644 --- a/.circleci/devcluster/double-reattach.devcluster.yaml +++ b/.circleci/devcluster/double-reattach.devcluster.yaml @@ -43,10 +43,6 @@ stages: pool_name: default provider: null task_container_defaults: null - - pool_name: pool1 - max_slots: 8 - scheduler: - type: priority scim: enabled: true auth: @@ -96,27 +92,4 @@ stages: agent_reconnect_attempts: 24 agent_reconnect_backoff: 5 container_auto_remove_disabled: true - artificial_slots: 4 - - - agent: - name: agent20 # Copy of agent1, but with different resource pool. - config_file: - master_host: 127.0.0.1 - master_port: 8081 - agent_id: agent1 - container_master_host: $DOCKER_LOCALHOST - agent_reconnect_attempts: 24 - agent_reconnect_backoff: 5 - container_auto_remove_disabled: true - resource_pool: pool1 - - - agent: - name: agent30 # Copy of agent1, but with empty(default) resource pool. - config_file: - master_host: 127.0.0.1 - master_port: 8081 - agent_id: agent1 - container_master_host: $DOCKER_LOCALHOST - agent_reconnect_attempts: 24 - agent_reconnect_backoff: 5 - container_auto_remove_disabled: true \ No newline at end of file + artificial_slots: 4 \ No newline at end of file diff --git a/.circleci/devcluster/multi-resource-pools.devcluster.yaml b/.circleci/devcluster/multi-resource-pools.devcluster.yaml new file mode 100644 index 00000000000..54986a82896 --- /dev/null +++ b/.circleci/devcluster/multi-resource-pools.devcluster.yaml @@ -0,0 +1,110 @@ +stages: + - db: + name: db + + - master: + pre: + - sh: make -C tools prep-root + config_file: + security: + initial_user_password: $INITIAL_USER_PASSWORD + db: + host: localhost + port: 5432 + password: postgres + user: postgres + name: determined + __internal: + preemption_timeout: 60s + checkpoint_storage: + type: shared_fs + host_path: /tmp + storage_path: determined-cp + log: + level: debug + root: tools/build + cache: + cache_dir: /tmp/determined-cache + launch_error: false + telemetry: + enabled: false + resource_manager: + default_aux_resource_pool: default + default_compute_resource_pool: default + scheduler: + fitting_policy: best + type: fair_share + type: agent + resource_pools: + - agent_reattach_enabled: true + agent_reconnect_wait: 25s + description: '' + max_aux_containers_per_agent: 100 + pool_name: default + provider: null + task_container_defaults: null + - pool_name: pool1 + max_slots: 8 + scheduler: + type: priority + scim: + enabled: true + auth: + type: basic + username: determined + password: password + + - custom: + name: proxy + cmd: ["socat", "-d", "-d", "TCP-LISTEN:8081,reuseaddr,fork", "TCP:localhost:8080"] + post: + - conncheck: + port: 8081 + + - agent: + name: agent1 + config_file: + master_host: 127.0.0.1 + master_port: 8081 + agent_id: agent1 + container_master_host: $DOCKER_LOCALHOST + agent_reconnect_attempts: 24 + agent_reconnect_backoff: 5 + container_auto_remove_disabled: true + hooks: + on_connection_lost: ["touch", "/tmp/agent1-connection-lost"] + + + - agent: + name: agent2 + config_file: + master_host: 127.0.0.1 + master_port: 8081 + agent_id: agent2 + container_master_host: $DOCKER_LOCALHOST + agent_reconnect_attempts: 24 + agent_reconnect_backoff: 5 + container_auto_remove_disabled: true + + - agent: + name: agent10 # Copy of agent1, but with different resource pool. + config_file: + master_host: 127.0.0.1 + master_port: 8081 + agent_id: agent1 + container_master_host: $DOCKER_LOCALHOST + agent_reconnect_attempts: 24 + agent_reconnect_backoff: 5 + container_auto_remove_disabled: true + resource_pool: pool1 + + - agent: + name: agent20 # Copy of agent1, but with empty(default) resource pool. + config_file: + master_host: 127.0.0.1 + master_port: 8081 + agent_id: agent1 + container_master_host: $DOCKER_LOCALHOST + agent_reconnect_attempts: 24 + agent_reconnect_backoff: 5 + container_auto_remove_disabled: true \ No newline at end of file diff --git a/agent/cmd/determined-agent/run_test.go b/agent/cmd/determined-agent/run_test.go index 593d749a4c5..d9ebda1ad4b 100644 --- a/agent/cmd/determined-agent/run_test.go +++ b/agent/cmd/determined-agent/run_test.go @@ -20,6 +20,7 @@ const DefaultRawConfig = ` log: level: trace color: true +resource_pool: default slot_type: auto security: tls: @@ -100,6 +101,7 @@ func TestMergeAgentConfigViaNewViper(t *testing.T) { log: level: trace color: true +resource_pool: default slot_type: auto security: tls: @@ -157,6 +159,7 @@ func TestMergeAgentConfigViaViperWithDefaults(t *testing.T) { log: level: trace color: true +resource_pool: default slot_type: auto security: tls: @@ -213,6 +216,7 @@ func TestMergeAgentConfigViaViperWithDefaultsEnvAndFlags(t *testing.T) { log: level: trace color: true +resource_pool: default slot_type: auto security: tls: diff --git a/agent/internal/options/options_test.go b/agent/internal/options/options_test.go index 3c114a2f513..8bc6f70813d 100644 --- a/agent/internal/options/options_test.go +++ b/agent/internal/options/options_test.go @@ -58,6 +58,7 @@ log: log: level: trace color: true +resource_pool: default slot_type: auto security: tls: diff --git a/e2e_tests/tests/cluster/conftest.py b/e2e_tests/tests/cluster/conftest.py index 9b61e3a30d1..2b12723ca7a 100644 --- a/e2e_tests/tests/cluster/conftest.py +++ b/e2e_tests/tests/cluster/conftest.py @@ -6,6 +6,8 @@ managed_cluster_session, managed_cluster_session_priority_scheduler, restartable_managed_cluster, + managed_cluster_multi_resource_pools, + managed_cluster_session_multi_resource_pools, ) from .managed_cluster_k8s import k8s_managed_cluster # noqa from .managed_slurm_cluster import ( # noqa diff --git a/e2e_tests/tests/cluster/managed_cluster.py b/e2e_tests/tests/cluster/managed_cluster.py index ea7a99fa592..c6c7c68632b 100644 --- a/e2e_tests/tests/cluster/managed_cluster.py +++ b/e2e_tests/tests/cluster/managed_cluster.py @@ -15,6 +15,9 @@ DEVCLUSTER_REATTACH_OFF_CONFIG_PATH = DEVCLUSTER_CONFIG_ROOT_PATH / "double.devcluster.yaml" DEVCLUSTER_REATTACH_ON_CONFIG_PATH = DEVCLUSTER_CONFIG_ROOT_PATH / "double-reattach.devcluster.yaml" DEVCLUSTER_PRIORITY_SCHEDULER_CONFIG_PATH = DEVCLUSTER_CONFIG_ROOT_PATH / "priority.devcluster.yaml" +DEVCLUSTER_MULTI_RP_CONFIG_PATH = ( + DEVCLUSTER_CONFIG_ROOT_PATH / "multi-resource-pools.devcluster.yaml" +) def get_agent_data(sess: api.Session) -> List[Dict[str, Any]]: @@ -184,7 +187,7 @@ def managed_cluster_restarts( managed_cluster_session: ManagedCluster, request: Any ) -> Iterator[ManagedCluster]: # check if priority scheduler or not using config. config = str(DEVCLUSTER_REATTACH_ON_CONFIG_PATH) - # port number is same for both reattach on and off config files so you can use either. + # port number is same for both reattach on and off config files or multi rp so you can use any. utils.set_master_port(config) nodeid = request.node.nodeid managed_cluster_session.log_marker(f"pytest [{utils.now_ts()}] {nodeid} setup\n") @@ -204,3 +207,27 @@ def restartable_managed_cluster( managed_cluster_restarts.restart_master() managed_cluster_restarts.restart_agent() raise + + +@pytest.fixture(scope="session") +def managed_cluster_session_multi_resource_pools(request: Any) -> Iterator[ManagedCluster]: + config = str(DEVCLUSTER_MULTI_RP_CONFIG_PATH) + with ManagedCluster(config) as mc: + mc.initial_startup() + yield mc + + +@pytest.fixture +def managed_cluster_multi_resource_pools( + managed_cluster_session_multi_resource_pools: ManagedCluster, request: Any +) -> Iterator[ManagedCluster]: + config = str(DEVCLUSTER_MULTI_RP_CONFIG_PATH) + utils.set_master_port(config) + nodeid = request.node.nodeid + managed_cluster_session_multi_resource_pools.log_marker( + f"pytest [{utils.now_ts()}] {nodeid} setup\n" + ) + yield managed_cluster_session_multi_resource_pools + managed_cluster_session_multi_resource_pools.log_marker( + f"pytest [{utils.now_ts()}] {nodeid} teardown\n" + ) diff --git a/e2e_tests/tests/cluster/test_master_restart.py b/e2e_tests/tests/cluster/test_master_restart.py index 294f04fd41b..3fdac2e24f0 100644 --- a/e2e_tests/tests/cluster/test_master_restart.py +++ b/e2e_tests/tests/cluster/test_master_restart.py @@ -760,12 +760,12 @@ def test_master_restart_with_queued( @pytest.mark.managed_devcluster def test_agent_resource_pool_change( - restartable_managed_cluster: managed_cluster.ManagedCluster, + managed_cluster_multi_resource_pools: managed_cluster.ManagedCluster, ) -> None: admin = api_utils.admin_session() try: - restartable_managed_cluster.kill_agent() - restartable_managed_cluster.dc.restart_stage("agent20") + managed_cluster_multi_resource_pools.kill_agent() + managed_cluster_multi_resource_pools.dc.restart_stage("agent10") for _i in range(5): agent_data = managed_cluster.get_agent_data(admin) @@ -777,18 +777,18 @@ def test_agent_resource_pool_change( f"agent with different resource pool is still present after {_i} ticks:{agent_data}" ) finally: - restartable_managed_cluster.dc.kill_stage("agent20") - restartable_managed_cluster.restart_agent() + managed_cluster_multi_resource_pools.dc.kill_stage("agent10") + managed_cluster_multi_resource_pools.restart_agent() @pytest.mark.managed_devcluster def test_agent_resource_pool_unchanged( - restartable_managed_cluster: managed_cluster.ManagedCluster, + managed_cluster_multi_resource_pools: managed_cluster.ManagedCluster, ) -> None: admin = api_utils.admin_session() try: - restartable_managed_cluster.kill_agent() - restartable_managed_cluster.dc.restart_stage("agent30") + managed_cluster_multi_resource_pools.kill_agent() + managed_cluster_multi_resource_pools.dc.restart_stage("agent20") for _i in range(5): agent_data = managed_cluster.get_agent_data(admin) @@ -797,5 +797,5 @@ def test_agent_resource_pool_unchanged( # which is not expected. pytest.fail("agent exploded even with the same resource pool") finally: - restartable_managed_cluster.dc.kill_stage("agent30") - restartable_managed_cluster.restart_agent() + managed_cluster_multi_resource_pools.dc.kill_stage("agent20") + managed_cluster_multi_resource_pools.restart_agent() From 47bea5b183385c00f1250c8d8170f20de4538d44 Mon Sep 17 00:00:00 2001 From: ShreyaLnuHpe Date: Sun, 18 Aug 2024 22:33:53 -0700 Subject: [PATCH 06/16] correct e2e test with new fixture --- e2e_tests/tests/cluster/conftest.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/e2e_tests/tests/cluster/conftest.py b/e2e_tests/tests/cluster/conftest.py index 2b12723ca7a..bbaba0fbc31 100644 --- a/e2e_tests/tests/cluster/conftest.py +++ b/e2e_tests/tests/cluster/conftest.py @@ -1,9 +1,11 @@ """Makes managed_cluster fixtures available to all files in the directory""" from .managed_cluster import ( # noqa + managed_cluster_multi_resource_pools, managed_cluster_priority_scheduler, managed_cluster_restarts, managed_cluster_session, + managed_cluster_session_multi_resource_pools, managed_cluster_session_priority_scheduler, restartable_managed_cluster, managed_cluster_multi_resource_pools, From d93bb752648015ed91a51b099271a65749f06e8c Mon Sep 17 00:00:00 2001 From: ShreyaLnuHpe Date: Sun, 18 Aug 2024 22:48:44 -0700 Subject: [PATCH 07/16] Sort in conftest.py --- e2e_tests/tests/cluster/conftest.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/e2e_tests/tests/cluster/conftest.py b/e2e_tests/tests/cluster/conftest.py index bbaba0fbc31..2dc82af4d25 100644 --- a/e2e_tests/tests/cluster/conftest.py +++ b/e2e_tests/tests/cluster/conftest.py @@ -8,8 +8,6 @@ managed_cluster_session_multi_resource_pools, managed_cluster_session_priority_scheduler, restartable_managed_cluster, - managed_cluster_multi_resource_pools, - managed_cluster_session_multi_resource_pools, ) from .managed_cluster_k8s import k8s_managed_cluster # noqa from .managed_slurm_cluster import ( # noqa From 556220b9c90e5212af54156f4de18b8e316f78fa Mon Sep 17 00:00:00 2001 From: ShreyaLnuHpe Date: Mon, 19 Aug 2024 00:08:36 -0700 Subject: [PATCH 08/16] create restartable managed cluster --- e2e_tests/tests/cluster/conftest.py | 1 + e2e_tests/tests/cluster/managed_cluster.py | 14 +++++++++++++ .../tests/cluster/test_master_restart.py | 20 +++++++++---------- 3 files changed, 25 insertions(+), 10 deletions(-) diff --git a/e2e_tests/tests/cluster/conftest.py b/e2e_tests/tests/cluster/conftest.py index 2dc82af4d25..ba545dd0546 100644 --- a/e2e_tests/tests/cluster/conftest.py +++ b/e2e_tests/tests/cluster/conftest.py @@ -8,6 +8,7 @@ managed_cluster_session_multi_resource_pools, managed_cluster_session_priority_scheduler, restartable_managed_cluster, + restartable_managed_cluster_multi_resource_pools, ) from .managed_cluster_k8s import k8s_managed_cluster # noqa from .managed_slurm_cluster import ( # noqa diff --git a/e2e_tests/tests/cluster/managed_cluster.py b/e2e_tests/tests/cluster/managed_cluster.py index c6c7c68632b..d62275c7ded 100644 --- a/e2e_tests/tests/cluster/managed_cluster.py +++ b/e2e_tests/tests/cluster/managed_cluster.py @@ -231,3 +231,17 @@ def managed_cluster_multi_resource_pools( managed_cluster_session_multi_resource_pools.log_marker( f"pytest [{utils.now_ts()}] {nodeid} teardown\n" ) + + +@pytest.fixture +def restartable_managed_cluster_multi_resource_pools( + managed_cluster_multi_resource_pools: ManagedCluster, +) -> Iterator[ManagedCluster]: + managed_cluster_multi_resource_pools.wait_for_agent_ok(20) + try: + yield managed_cluster_multi_resource_pools + managed_cluster_multi_resource_pools.wait_for_agent_ok(20) + except Exception: + managed_cluster_multi_resource_pools.restart_master() + managed_cluster_multi_resource_pools.restart_agent() + raise diff --git a/e2e_tests/tests/cluster/test_master_restart.py b/e2e_tests/tests/cluster/test_master_restart.py index 3fdac2e24f0..053a38f8b1d 100644 --- a/e2e_tests/tests/cluster/test_master_restart.py +++ b/e2e_tests/tests/cluster/test_master_restart.py @@ -760,12 +760,12 @@ def test_master_restart_with_queued( @pytest.mark.managed_devcluster def test_agent_resource_pool_change( - managed_cluster_multi_resource_pools: managed_cluster.ManagedCluster, + restartable_managed_cluster_multi_resource_pools: managed_cluster.ManagedCluster, ) -> None: admin = api_utils.admin_session() try: - managed_cluster_multi_resource_pools.kill_agent() - managed_cluster_multi_resource_pools.dc.restart_stage("agent10") + restartable_managed_cluster_multi_resource_pools.kill_agent() + restartable_managed_cluster_multi_resource_pools.dc.restart_stage("agent10") for _i in range(5): agent_data = managed_cluster.get_agent_data(admin) @@ -777,18 +777,18 @@ def test_agent_resource_pool_change( f"agent with different resource pool is still present after {_i} ticks:{agent_data}" ) finally: - managed_cluster_multi_resource_pools.dc.kill_stage("agent10") - managed_cluster_multi_resource_pools.restart_agent() + restartable_managed_cluster_multi_resource_pools.dc.kill_stage("agent10") + restartable_managed_cluster_multi_resource_pools.restart_agent() @pytest.mark.managed_devcluster def test_agent_resource_pool_unchanged( - managed_cluster_multi_resource_pools: managed_cluster.ManagedCluster, + restartable_managed_cluster_multi_resource_pools: managed_cluster.ManagedCluster, ) -> None: admin = api_utils.admin_session() try: - managed_cluster_multi_resource_pools.kill_agent() - managed_cluster_multi_resource_pools.dc.restart_stage("agent20") + restartable_managed_cluster_multi_resource_pools.kill_agent() + restartable_managed_cluster_multi_resource_pools.dc.restart_stage("agent20") for _i in range(5): agent_data = managed_cluster.get_agent_data(admin) @@ -797,5 +797,5 @@ def test_agent_resource_pool_unchanged( # which is not expected. pytest.fail("agent exploded even with the same resource pool") finally: - managed_cluster_multi_resource_pools.dc.kill_stage("agent20") - managed_cluster_multi_resource_pools.restart_agent() + restartable_managed_cluster_multi_resource_pools.dc.kill_stage("agent20") + restartable_managed_cluster_multi_resource_pools.restart_agent() From b3eb2d026f5bb1c18124aa35b84975131978477d Mon Sep 17 00:00:00 2001 From: ShreyaLnuHpe Date: Mon, 19 Aug 2024 10:57:06 -0700 Subject: [PATCH 09/16] change port for connection --- .../multi-resource-pools.devcluster.yaml | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/.circleci/devcluster/multi-resource-pools.devcluster.yaml b/.circleci/devcluster/multi-resource-pools.devcluster.yaml index 54986a82896..42effbd2427 100644 --- a/.circleci/devcluster/multi-resource-pools.devcluster.yaml +++ b/.circleci/devcluster/multi-resource-pools.devcluster.yaml @@ -1,6 +1,7 @@ stages: - db: name: db + port: 5436 - master: pre: @@ -10,7 +11,7 @@ stages: initial_user_password: $INITIAL_USER_PASSWORD db: host: localhost - port: 5432 + port: 5436 password: postgres user: postgres name: determined @@ -56,16 +57,16 @@ stages: - custom: name: proxy - cmd: ["socat", "-d", "-d", "TCP-LISTEN:8081,reuseaddr,fork", "TCP:localhost:8080"] + cmd: ["socat", "-d", "-d", "TCP-LISTEN:8083,reuseaddr,fork", "TCP:localhost:8080"] post: - conncheck: - port: 8081 + port: 8083 - agent: name: agent1 config_file: master_host: 127.0.0.1 - master_port: 8081 + master_port: 8083 agent_id: agent1 container_master_host: $DOCKER_LOCALHOST agent_reconnect_attempts: 24 @@ -79,7 +80,7 @@ stages: name: agent2 config_file: master_host: 127.0.0.1 - master_port: 8081 + master_port: 8083 agent_id: agent2 container_master_host: $DOCKER_LOCALHOST agent_reconnect_attempts: 24 @@ -90,7 +91,7 @@ stages: name: agent10 # Copy of agent1, but with different resource pool. config_file: master_host: 127.0.0.1 - master_port: 8081 + master_port: 8083 agent_id: agent1 container_master_host: $DOCKER_LOCALHOST agent_reconnect_attempts: 24 @@ -102,7 +103,7 @@ stages: name: agent20 # Copy of agent1, but with empty(default) resource pool. config_file: master_host: 127.0.0.1 - master_port: 8081 + master_port: 8083 agent_id: agent1 container_master_host: $DOCKER_LOCALHOST agent_reconnect_attempts: 24 From 69920f5e714a1635d09b79e6a03617b66fc4e892 Mon Sep 17 00:00:00 2001 From: ShreyaLnuHpe Date: Wed, 21 Aug 2024 21:32:16 -0700 Subject: [PATCH 10/16] Test new mark --- .circleci/real_config.yml | 14 +++++ e2e_tests/pytest.ini | 1 + .../tests/cluster/test_master_restart.py | 56 +++++++++++++++++-- e2e_tests/tests/conftest.py | 1 + 4 files changed, 68 insertions(+), 4 deletions(-) diff --git a/.circleci/real_config.yml b/.circleci/real_config.yml index d31bf390743..a9959a76190 100644 --- a/.circleci/real_config.yml +++ b/.circleci/real_config.yml @@ -4512,6 +4512,20 @@ workflows: extra-pytest-flags: "--no-compare-stats" collect-det-job-logs: false + - test-e2e: + name: test-e2e-managed-devcluster-resource-pools + context: + - dev-ci-cluster-default-user-credentials + requires: + - build-go-ee + parallelism: 4 + resource-class: large + tf2: true + mark: managed_devcluster_resource_pools + managed-devcluster: true + extra-pytest-flags: "--no-compare-stats" + collect-det-job-logs: false + - test-e2e: name: test-e2e-multi-k8s context: diff --git a/e2e_tests/pytest.ini b/e2e_tests/pytest.ini index 44c0c56e259..153cecd22ff 100644 --- a/e2e_tests/pytest.ini +++ b/e2e_tests/pytest.ini @@ -37,6 +37,7 @@ markers = nightly: nightly tests det_deploy_local: test det deploy local managed_devcluster: cluster tests that require a pytest-side managed cluster + managed_devcluster_resource_pools: cluster tests that require a pytest-side managed cluster with multiple resource pools port_registry: tests for port registry and unique port offset distributed_quarantine: distributed training tests (quarantine) diff --git a/e2e_tests/tests/cluster/test_master_restart.py b/e2e_tests/tests/cluster/test_master_restart.py index 053a38f8b1d..254c8e38635 100644 --- a/e2e_tests/tests/cluster/test_master_restart.py +++ b/e2e_tests/tests/cluster/test_master_restart.py @@ -14,6 +14,7 @@ from tests import detproc from tests import experiment as exp from tests.cluster import abstract_cluster, managed_cluster, managed_cluster_k8s, utils +# from tests.cluster import managed_devcluster_resource_pools from tests.task import task logger = logging.getLogger(__name__) @@ -758,9 +759,10 @@ def test_master_restart_with_queued( utils.assert_command_succeeded(sess, cmd_id) -@pytest.mark.managed_devcluster +@pytest.mark.managed_devcluster_resource_pools def test_agent_resource_pool_change( - restartable_managed_cluster_multi_resource_pools: managed_cluster.ManagedCluster, + restartable_managed_cluster_multi_resource_pools: + managed_cluster.ManagedCluster, ) -> None: admin = api_utils.admin_session() try: @@ -781,9 +783,10 @@ def test_agent_resource_pool_change( restartable_managed_cluster_multi_resource_pools.restart_agent() -@pytest.mark.managed_devcluster +@pytest.mark.managed_devcluster_resource_pools def test_agent_resource_pool_unchanged( - restartable_managed_cluster_multi_resource_pools: managed_cluster.ManagedCluster, + restartable_managed_cluster_multi_resource_pools: + managed_cluster.ManagedCluster, ) -> None: admin = api_utils.admin_session() try: @@ -799,3 +802,48 @@ def test_agent_resource_pool_unchanged( finally: restartable_managed_cluster_multi_resource_pools.dc.kill_stage("agent20") restartable_managed_cluster_multi_resource_pools.restart_agent() + + +# @pytest.mark.managed_devcluster_resource_pools +# def test_agent_resource_pool_change( +# restartable_managed_cluster_multi_resource_pools: +# managed_devcluster_resource_pools.ManagedDevclusterResourcePools, +# ) -> None: +# admin = api_utils.admin_session() +# try: +# restartable_managed_cluster_multi_resource_pools.kill_agent() +# restartable_managed_cluster_multi_resource_pools.dc.restart_stage("agent10") + +# for _i in range(5): +# agent_data = managed_cluster.get_agent_data(admin) +# if len(agent_data) == 0: +# # Agent has exploded and been wiped due to resource pool mismatch, as expected. +# break +# else: +# pytest.fail( +# f"agent with different resource pool is still present after {_i} ticks:{agent_data}" +# ) +# finally: +# restartable_managed_cluster_multi_resource_pools.dc.kill_stage("agent10") +# restartable_managed_cluster_multi_resource_pools.restart_agent() + + +# @pytest.mark.managed_devcluster_resource_pools +# def test_agent_resource_pool_unchanged( +# restartable_managed_cluster_multi_resource_pools: +# managed_devcluster_resource_pools.ManagedDevclusterResourcePools, +# ) -> None: +# admin = api_utils.admin_session() +# try: +# restartable_managed_cluster_multi_resource_pools.kill_agent() +# restartable_managed_cluster_multi_resource_pools.dc.restart_stage("agent20") + +# for _i in range(5): +# agent_data = managed_cluster.get_agent_data(admin) +# if len(agent_data) == 0: +# # Agent has exploded and been wiped due to resource pool mismatch, +# # which is not expected. +# pytest.fail("agent exploded even with the same resource pool") +# finally: +# restartable_managed_cluster_multi_resource_pools.dc.kill_stage("agent20") +# restartable_managed_cluster_multi_resource_pools.restart_agent() diff --git a/e2e_tests/tests/conftest.py b/e2e_tests/tests/conftest.py index 0c8a2498ce4..d0c96bf53d3 100644 --- a/e2e_tests/tests/conftest.py +++ b/e2e_tests/tests/conftest.py @@ -49,6 +49,7 @@ "model_hub_mmdetection", "deepspeed", "managed_devcluster", + "managed_devcluster_resource_pools", "port_registry", "distributed_quarantine", "det_deploy_local_quarantine", From 27825d2410ad7e8887190c8c3932997d3f8fc635 Mon Sep 17 00:00:00 2001 From: ShreyaLnuHpe Date: Wed, 21 Aug 2024 22:16:42 -0700 Subject: [PATCH 11/16] remove comments/unused lines --- .../multi-resource-pools.devcluster.yaml | 16 +++--- .../tests/cluster/test_master_restart.py | 52 +------------------ 2 files changed, 10 insertions(+), 58 deletions(-) diff --git a/.circleci/devcluster/multi-resource-pools.devcluster.yaml b/.circleci/devcluster/multi-resource-pools.devcluster.yaml index 42effbd2427..833dc58d34b 100644 --- a/.circleci/devcluster/multi-resource-pools.devcluster.yaml +++ b/.circleci/devcluster/multi-resource-pools.devcluster.yaml @@ -1,7 +1,7 @@ stages: - db: name: db - port: 5436 + port: 5434 - master: pre: @@ -11,7 +11,7 @@ stages: initial_user_password: $INITIAL_USER_PASSWORD db: host: localhost - port: 5436 + port: 5434 password: postgres user: postgres name: determined @@ -57,16 +57,16 @@ stages: - custom: name: proxy - cmd: ["socat", "-d", "-d", "TCP-LISTEN:8083,reuseaddr,fork", "TCP:localhost:8080"] + cmd: ["socat", "-d", "-d", "TCP-LISTEN:8081,reuseaddr,fork", "TCP:localhost:8080"] post: - conncheck: - port: 8083 + port: 8081 - agent: name: agent1 config_file: master_host: 127.0.0.1 - master_port: 8083 + master_port: 8081 agent_id: agent1 container_master_host: $DOCKER_LOCALHOST agent_reconnect_attempts: 24 @@ -80,7 +80,7 @@ stages: name: agent2 config_file: master_host: 127.0.0.1 - master_port: 8083 + master_port: 8081 agent_id: agent2 container_master_host: $DOCKER_LOCALHOST agent_reconnect_attempts: 24 @@ -91,7 +91,7 @@ stages: name: agent10 # Copy of agent1, but with different resource pool. config_file: master_host: 127.0.0.1 - master_port: 8083 + master_port: 8081 agent_id: agent1 container_master_host: $DOCKER_LOCALHOST agent_reconnect_attempts: 24 @@ -103,7 +103,7 @@ stages: name: agent20 # Copy of agent1, but with empty(default) resource pool. config_file: master_host: 127.0.0.1 - master_port: 8083 + master_port: 8081 agent_id: agent1 container_master_host: $DOCKER_LOCALHOST agent_reconnect_attempts: 24 diff --git a/e2e_tests/tests/cluster/test_master_restart.py b/e2e_tests/tests/cluster/test_master_restart.py index 254c8e38635..8e95025cdea 100644 --- a/e2e_tests/tests/cluster/test_master_restart.py +++ b/e2e_tests/tests/cluster/test_master_restart.py @@ -14,7 +14,6 @@ from tests import detproc from tests import experiment as exp from tests.cluster import abstract_cluster, managed_cluster, managed_cluster_k8s, utils -# from tests.cluster import managed_devcluster_resource_pools from tests.task import task logger = logging.getLogger(__name__) @@ -761,8 +760,7 @@ def test_master_restart_with_queued( @pytest.mark.managed_devcluster_resource_pools def test_agent_resource_pool_change( - restartable_managed_cluster_multi_resource_pools: - managed_cluster.ManagedCluster, + restartable_managed_cluster_multi_resource_pools: managed_cluster.ManagedCluster, ) -> None: admin = api_utils.admin_session() try: @@ -785,8 +783,7 @@ def test_agent_resource_pool_change( @pytest.mark.managed_devcluster_resource_pools def test_agent_resource_pool_unchanged( - restartable_managed_cluster_multi_resource_pools: - managed_cluster.ManagedCluster, + restartable_managed_cluster_multi_resource_pools: managed_cluster.ManagedCluster, ) -> None: admin = api_utils.admin_session() try: @@ -802,48 +799,3 @@ def test_agent_resource_pool_unchanged( finally: restartable_managed_cluster_multi_resource_pools.dc.kill_stage("agent20") restartable_managed_cluster_multi_resource_pools.restart_agent() - - -# @pytest.mark.managed_devcluster_resource_pools -# def test_agent_resource_pool_change( -# restartable_managed_cluster_multi_resource_pools: -# managed_devcluster_resource_pools.ManagedDevclusterResourcePools, -# ) -> None: -# admin = api_utils.admin_session() -# try: -# restartable_managed_cluster_multi_resource_pools.kill_agent() -# restartable_managed_cluster_multi_resource_pools.dc.restart_stage("agent10") - -# for _i in range(5): -# agent_data = managed_cluster.get_agent_data(admin) -# if len(agent_data) == 0: -# # Agent has exploded and been wiped due to resource pool mismatch, as expected. -# break -# else: -# pytest.fail( -# f"agent with different resource pool is still present after {_i} ticks:{agent_data}" -# ) -# finally: -# restartable_managed_cluster_multi_resource_pools.dc.kill_stage("agent10") -# restartable_managed_cluster_multi_resource_pools.restart_agent() - - -# @pytest.mark.managed_devcluster_resource_pools -# def test_agent_resource_pool_unchanged( -# restartable_managed_cluster_multi_resource_pools: -# managed_devcluster_resource_pools.ManagedDevclusterResourcePools, -# ) -> None: -# admin = api_utils.admin_session() -# try: -# restartable_managed_cluster_multi_resource_pools.kill_agent() -# restartable_managed_cluster_multi_resource_pools.dc.restart_stage("agent20") - -# for _i in range(5): -# agent_data = managed_cluster.get_agent_data(admin) -# if len(agent_data) == 0: -# # Agent has exploded and been wiped due to resource pool mismatch, -# # which is not expected. -# pytest.fail("agent exploded even with the same resource pool") -# finally: -# restartable_managed_cluster_multi_resource_pools.dc.kill_stage("agent20") -# restartable_managed_cluster_multi_resource_pools.restart_agent() From 178a0969038aa036a8742f2f58c3c2b4842ee7ad Mon Sep 17 00:00:00 2001 From: ShreyaLnuHpe Date: Wed, 21 Aug 2024 22:19:56 -0700 Subject: [PATCH 12/16] minor correction --- e2e_tests/tests/cluster/managed_cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/e2e_tests/tests/cluster/managed_cluster.py b/e2e_tests/tests/cluster/managed_cluster.py index d62275c7ded..c31484b6ef9 100644 --- a/e2e_tests/tests/cluster/managed_cluster.py +++ b/e2e_tests/tests/cluster/managed_cluster.py @@ -187,7 +187,7 @@ def managed_cluster_restarts( managed_cluster_session: ManagedCluster, request: Any ) -> Iterator[ManagedCluster]: # check if priority scheduler or not using config. config = str(DEVCLUSTER_REATTACH_ON_CONFIG_PATH) - # port number is same for both reattach on and off config files or multi rp so you can use any. + # port number is same for both reattach on and off config files so you can use either. utils.set_master_port(config) nodeid = request.node.nodeid managed_cluster_session.log_marker(f"pytest [{utils.now_ts()}] {nodeid} setup\n") From 4877be6feaea23fcec8477639ed3ff69c6c3c84d Mon Sep 17 00:00:00 2001 From: ShreyaLnuHpe Date: Tue, 27 Aug 2024 18:31:06 -0700 Subject: [PATCH 13/16] changes based on comments --- .../devcluster/double-reattach.devcluster.yaml | 2 +- master/internal/rm/agentrm/agent.go | 2 +- master/internal/rm/agentrm/agent_state_test.go | 16 ++++++++++------ 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/.circleci/devcluster/double-reattach.devcluster.yaml b/.circleci/devcluster/double-reattach.devcluster.yaml index 4aebab4c76b..a7111202fa8 100644 --- a/.circleci/devcluster/double-reattach.devcluster.yaml +++ b/.circleci/devcluster/double-reattach.devcluster.yaml @@ -92,4 +92,4 @@ stages: agent_reconnect_attempts: 24 agent_reconnect_backoff: 5 container_auto_remove_disabled: true - artificial_slots: 4 \ No newline at end of file + artificial_slots: 4 diff --git a/master/internal/rm/agentrm/agent.go b/master/internal/rm/agentrm/agent.go index 7c762565e05..ac87d2f427c 100644 --- a/master/internal/rm/agentrm/agent.go +++ b/master/internal/rm/agentrm/agent.go @@ -623,7 +623,7 @@ func (a *agent) HandleIncomingWebsocketMessage(msg *aproto.MasterMessage) { resourcePoolErr := a.agentState.checkAgentResourcePoolMatch(msg.AgentStarted) if resourcePoolErr != nil { a.syslog.WithError(resourcePoolErr). - Error("change in agent resource pool was detected") + Error("change in agent resource pool was detected during reconnect") a.socket.Outbox <- aproto.AgentMessage{ AgentShutdown: &aproto.AgentShutdown{ ErrMsg: aproto.ErrAgentMustReconnect.Error(), diff --git a/master/internal/rm/agentrm/agent_state_test.go b/master/internal/rm/agentrm/agent_state_test.go index 921c876c337..217c44c29d7 100644 --- a/master/internal/rm/agentrm/agent_state_test.go +++ b/master/internal/rm/agentrm/agent_state_test.go @@ -324,6 +324,10 @@ func Test_agentState_checkAgentStartedDevicesMatch(t *testing.T) { } func Test_agentState_checkAgentResourcePoolMatch(t *testing.T) { + const ( + poolOne = "pool1" + poolTwo = "pool2" + ) tests := []struct { name string state agentState @@ -333,17 +337,17 @@ func Test_agentState_checkAgentResourcePoolMatch(t *testing.T) { { name: "resource pool name match", state: agentState{ - resourcePoolName: "pool1", + resourcePoolName: poolOne, }, agentStarted: &aproto.AgentStarted{ - ResourcePoolName: "pool1", + ResourcePoolName: poolOne, }, wantErrContains: "", }, { name: "resource pool name is missing", state: agentState{ - resourcePoolName: "pool1", + resourcePoolName: poolOne, }, agentStarted: &aproto.AgentStarted{ResourcePoolName: defaultResourcePoolName}, wantErrContains: "resource pool has changed", @@ -353,16 +357,16 @@ func Test_agentState_checkAgentResourcePoolMatch(t *testing.T) { state: agentState{ resourcePoolName: defaultResourcePoolName, }, - agentStarted: &aproto.AgentStarted{ResourcePoolName: "pool1"}, + agentStarted: &aproto.AgentStarted{ResourcePoolName: poolOne}, wantErrContains: "resource pool has changed", }, { name: "mismatched resource pool name", state: agentState{ - resourcePoolName: "pool1", + resourcePoolName: poolOne, }, agentStarted: &aproto.AgentStarted{ - ResourcePoolName: "pool2", + ResourcePoolName: poolTwo, }, wantErrContains: "resource pool has changed", }, From 534228125cfe71af1ad1bc9129eb22cd9ee4b85f Mon Sep 17 00:00:00 2001 From: ShreyaLnuHpe Date: Tue, 27 Aug 2024 18:32:27 -0700 Subject: [PATCH 14/16] add newline --- .circleci/devcluster/multi-resource-pools.devcluster.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/devcluster/multi-resource-pools.devcluster.yaml b/.circleci/devcluster/multi-resource-pools.devcluster.yaml index 833dc58d34b..5529e251148 100644 --- a/.circleci/devcluster/multi-resource-pools.devcluster.yaml +++ b/.circleci/devcluster/multi-resource-pools.devcluster.yaml @@ -108,4 +108,4 @@ stages: container_master_host: $DOCKER_LOCALHOST agent_reconnect_attempts: 24 agent_reconnect_backoff: 5 - container_auto_remove_disabled: true \ No newline at end of file + container_auto_remove_disabled: true From f329db484c1cd286e32858aaf5fb21adff762120 Mon Sep 17 00:00:00 2001 From: ShreyaLnuHpe Date: Wed, 28 Aug 2024 12:38:08 -0700 Subject: [PATCH 15/16] add note --- docs/get-started/architecture/introduction.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/get-started/architecture/introduction.rst b/docs/get-started/architecture/introduction.rst index 1a4585cb65d..bc13ae9022e 100644 --- a/docs/get-started/architecture/introduction.rst +++ b/docs/get-started/architecture/introduction.rst @@ -391,6 +391,11 @@ If you are using static resource pools and launching agents by hand, you will ne :ref:`agent configuration ` to specify which resource pool the agent should join. +Note that to change an agent's assigned resource_pool after it has already joined one, you need to +update the :ref:`agent configuration `. Make sure to drain the agents +properly before modifying the resource_pool. After making the changes, restart the agent to connect +it to the new resource pool. + Migrate to Resource Pools ------------------------- From 2cbf48f2443432304c6882a9b73544ac2a7ee0d2 Mon Sep 17 00:00:00 2001 From: ShreyaLnuHpe Date: Wed, 28 Aug 2024 13:10:08 -0700 Subject: [PATCH 16/16] correct the note --- docs/get-started/architecture/introduction.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/get-started/architecture/introduction.rst b/docs/get-started/architecture/introduction.rst index bc13ae9022e..815e4fbd261 100644 --- a/docs/get-started/architecture/introduction.rst +++ b/docs/get-started/architecture/introduction.rst @@ -391,10 +391,10 @@ If you are using static resource pools and launching agents by hand, you will ne :ref:`agent configuration ` to specify which resource pool the agent should join. -Note that to change an agent's assigned resource_pool after it has already joined one, you need to -update the :ref:`agent configuration `. Make sure to drain the agents -properly before modifying the resource_pool. After making the changes, restart the agent to connect -it to the new resource pool. +To change the resource pool an agent is assigned to after it has already joined one, you need to +update the :ref:`agent configuration `. Before making this change, ensure +the agents are properly drained. Once the configuration is updated, restart the agent to connect it +to the new resource pool. Migrate to Resource Pools -------------------------