From d3caacd636684199383cec55f50f3ead255c8e79 Mon Sep 17 00:00:00 2001
From: ShreyaLnuHpe <shreya.lnu@hpe.com>
Date: Tue, 13 Aug 2024 09:23:02 -0700
Subject: [PATCH 01/16] fix: reject reconnecting agents with different resource
 pool configuration

---
 agent/internal/agent.go                       |  2 +
 master/internal/rm/agentrm/agent.go           | 13 +++++
 master/internal/rm/agentrm/agent_state.go     | 14 ++++++
 .../internal/rm/agentrm/agent_state_test.go   | 50 +++++++++++++++++++
 master/pkg/aproto/master_message.go           |  1 +
 5 files changed, 80 insertions(+)

diff --git a/agent/internal/agent.go b/agent/internal/agent.go
index 80693407d32..e30003f92bf 100644
--- a/agent/internal/agent.go
+++ b/agent/internal/agent.go
@@ -160,6 +160,7 @@ func (a *Agent) run(ctx context.Context) error {
 		Version:              a.version,
 		Devices:              devices,
 		ContainersReattached: reattached,
+		ResourcePoolName:     a.opts.ResourcePool,
 	}}:
 	case <-ctx.Done():
 		return ctx.Err()
@@ -351,6 +352,7 @@ func (a *Agent) reconnectFlow(
 		Version:              a.version,
 		Devices:              devices,
 		ContainersReattached: reattached,
+		ResourcePoolName:     a.opts.ResourcePool,
 	}}:
 	case <-ctx.Done():
 		return nil, nil, ctx.Err()
diff --git a/master/internal/rm/agentrm/agent.go b/master/internal/rm/agentrm/agent.go
index a5465898910..7c762565e05 100644
--- a/master/internal/rm/agentrm/agent.go
+++ b/master/internal/rm/agentrm/agent.go
@@ -620,6 +620,19 @@ func (a *agent) HandleIncomingWebsocketMessage(msg *aproto.MasterMessage) {
 				a.stop(err)
 				return
 			}
+			resourcePoolErr := a.agentState.checkAgentResourcePoolMatch(msg.AgentStarted)
+			if resourcePoolErr != nil {
+				a.syslog.WithError(resourcePoolErr).
+					Error("change in agent resource pool was detected")
+				a.socket.Outbox <- aproto.AgentMessage{
+					AgentShutdown: &aproto.AgentShutdown{
+						ErrMsg: aproto.ErrAgentMustReconnect.Error(),
+					},
+				}
+
+				a.stop(resourcePoolErr)
+				return
+			}
 		} else {
 			a.agentStarted(msg.AgentStarted)
 		}
diff --git a/master/internal/rm/agentrm/agent_state.go b/master/internal/rm/agentrm/agent_state.go
index 0a5941173db..fd5e43a7964 100644
--- a/master/internal/rm/agentrm/agent_state.go
+++ b/master/internal/rm/agentrm/agent_state.go
@@ -289,6 +289,20 @@ func (a *agentState) checkAgentStartedDevicesMatch(
 	return nil
 }
 
+func (a *agentState) checkAgentResourcePoolMatch(
+	agentStarted *aproto.AgentStarted,
+) error {
+	// We need to compare the resource pool configurations between the agent and the master.
+	// Ideally, the master doesn't request new resource pool information if the agent reconnects
+	// within the designated reconnection period, while the agent should read from its updated configuration.
+	// If there's a mismatch, an error will be thrown, causing the agent to stop and require a restart.
+	if a.resourcePoolName != agentStarted.ResourcePoolName {
+		return fmt.Errorf("resource pool has changed: %s -> %s", a.resourcePoolName, agentStarted.ResourcePoolName)
+	}
+
+	return nil
+}
+
 func (a *agentState) containerStateChanged(msg aproto.ContainerStateChanged) {
 	for _, d := range msg.Container.Devices {
 		s, ok := a.slotStates[d.ID]
diff --git a/master/internal/rm/agentrm/agent_state_test.go b/master/internal/rm/agentrm/agent_state_test.go
index 3341f83f9d0..56a9993bd33 100644
--- a/master/internal/rm/agentrm/agent_state_test.go
+++ b/master/internal/rm/agentrm/agent_state_test.go
@@ -70,6 +70,7 @@ func TestAgentStatePersistence(t *testing.T) {
 		Version:              "",
 		Devices:              devices,
 		ContainersReattached: []aproto.ContainerReattachAck{},
+		ResourcePoolName:     "",
 	}
 	state.agentStarted(started)
 	require.Len(t, state.getSlotsSummary("/myagent"), 2)
@@ -322,6 +323,54 @@ func Test_agentState_checkAgentStartedDevicesMatch(t *testing.T) {
 	}
 }
 
+func Test_agentState_checkAgentResourcePoolMatch(t *testing.T) {
+	tests := []struct {
+		name            string
+		state           agentState
+		agentStarted    *aproto.AgentStarted
+		wantErrContains string
+	}{
+		{
+			name: "resource pool name match",
+			state: agentState{
+				resourcePoolName: "pool1",
+			},
+			agentStarted: &aproto.AgentStarted{
+				ResourcePoolName: "pool1",
+			},
+			wantErrContains: "",
+		},
+		{
+			name: "resource pool name is missing",
+			state: agentState{
+				resourcePoolName: "pool1",
+			},
+			agentStarted:    &aproto.AgentStarted{ResourcePoolName: ""},
+			wantErrContains: "resource pool has changed",
+		},
+		{
+			name: "mismatched resource pool name",
+			state: agentState{
+				resourcePoolName: "pool1",
+			},
+			agentStarted: &aproto.AgentStarted{
+				ResourcePoolName: "pool2",
+			},
+			wantErrContains: "resource pool has changed",
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			err := tt.state.checkAgentResourcePoolMatch(tt.agentStarted)
+			if tt.wantErrContains == "" {
+				require.NoError(t, err)
+				return
+			}
+			require.ErrorContains(t, err, tt.wantErrContains)
+		})
+	}
+}
+
 func TestSlotStates(t *testing.T) {
 	rpName := "test"
 	state := newAgentState(aproto.ID(uuid.NewString()), 64)
@@ -345,6 +394,7 @@ func TestSlotStates(t *testing.T) {
 		Version:              "",
 		Devices:              devices,
 		ContainersReattached: []aproto.ContainerReattachAck{},
+		ResourcePoolName:     "",
 	}
 	state.agentStarted(started)
 	slots := state.getSlotsSummary("/")
diff --git a/master/pkg/aproto/master_message.go b/master/pkg/aproto/master_message.go
index edbefb08b1c..ea2385377b6 100644
--- a/master/pkg/aproto/master_message.go
+++ b/master/pkg/aproto/master_message.go
@@ -80,6 +80,7 @@ type AgentStarted struct {
 	Version              string
 	Devices              []device.Device
 	ContainersReattached []ContainerReattachAck
+	ResourcePoolName     string
 }
 
 // ContainerStateChanged notifies the master that the agent transitioned the container state.

From b4a24f79ebe5b461d512d7dced83b0d54d3c8851 Mon Sep 17 00:00:00 2001
From: ShreyaLnuHpe <shreya.lnu@hpe.com>
Date: Tue, 13 Aug 2024 22:47:38 -0700
Subject: [PATCH 02/16] add e2e test

---
 .../double-reattach.devcluster.yaml           | 16 +++++++++++++
 .../tests/cluster/test_master_restart.py      | 23 +++++++++++++++++++
 2 files changed, 39 insertions(+)

diff --git a/.circleci/devcluster/double-reattach.devcluster.yaml b/.circleci/devcluster/double-reattach.devcluster.yaml
index a7111202fa8..7333c64ff57 100644
--- a/.circleci/devcluster/double-reattach.devcluster.yaml
+++ b/.circleci/devcluster/double-reattach.devcluster.yaml
@@ -43,6 +43,10 @@ stages:
             pool_name: default
             provider: null
             task_container_defaults: null
+          - pool_name: pool1
+            max_slots: 8
+            scheduler:
+              type: priority
         scim:
           enabled: true
           auth:
@@ -93,3 +97,15 @@ stages:
         agent_reconnect_backoff: 5
         container_auto_remove_disabled: true
         artificial_slots: 4
+
+  - agent:
+      name: agent20 # Copy of agent1, but with different resource pool.
+      config_file:
+        master_host: 127.0.0.1
+        master_port: 8081
+        agent_id: agent1
+        container_master_host: $DOCKER_LOCALHOST
+        agent_reconnect_attempts: 24
+        agent_reconnect_backoff: 5
+        container_auto_remove_disabled: true
+        resource_pool: pool1
\ No newline at end of file
diff --git a/e2e_tests/tests/cluster/test_master_restart.py b/e2e_tests/tests/cluster/test_master_restart.py
index cc20d4931e4..039c2a21bf6 100644
--- a/e2e_tests/tests/cluster/test_master_restart.py
+++ b/e2e_tests/tests/cluster/test_master_restart.py
@@ -710,3 +710,26 @@ def test_master_restart_with_queued(
     for cmd_id in [running_command_id, queued_command_id]:
         utils.wait_for_command_state(sess, cmd_id, "TERMINATED", 90)
         utils.assert_command_succeeded(sess, cmd_id)
+
+
+@pytest.mark.managed_devcluster
+def test_agent_resource_pool_change(
+    restartable_managed_cluster: managed_cluster.ManagedCluster,
+) -> None:
+    admin = api_utils.admin_session()
+    try:
+        restartable_managed_cluster.kill_agent()
+        restartable_managed_cluster.dc.restart_stage("agent20")
+
+        for _i in range(5):
+            agent_data = managed_cluster.get_agent_data(admin)
+            if len(agent_data) == 0:
+                # Agent has exploded and been wiped due to resource pool mismatch, as expected.
+                break
+        else:
+            pytest.fail(
+                f"agent with different resource pool is still present after {_i} ticks:{agent_data}"
+            )
+    finally:
+        restartable_managed_cluster.dc.kill_stage("agent20")
+        restartable_managed_cluster.restart_agent()

From 2a4d4f2d1b52df7be70d12a01f40dd0ef133d931 Mon Sep 17 00:00:00 2001
From: ShreyaLnuHpe <shreya.lnu@hpe.com>
Date: Wed, 14 Aug 2024 15:01:38 -0700
Subject: [PATCH 03/16] add extra test cases

---
 .../double-reattach.devcluster.yaml           | 13 +++++++++-
 .../tests/cluster/test_master_restart.py      | 20 ++++++++++++++
 master/internal/rm/agentrm/agent_state.go     | 16 ++++++++----
 .../internal/rm/agentrm/agent_state_test.go   | 26 +++++++++++++++++++
 .../rm/agentrm/resource_managers_test.go      |  2 --
 5 files changed, 69 insertions(+), 8 deletions(-)

diff --git a/.circleci/devcluster/double-reattach.devcluster.yaml b/.circleci/devcluster/double-reattach.devcluster.yaml
index 7333c64ff57..59026d25f58 100644
--- a/.circleci/devcluster/double-reattach.devcluster.yaml
+++ b/.circleci/devcluster/double-reattach.devcluster.yaml
@@ -108,4 +108,15 @@ stages:
         agent_reconnect_attempts: 24
         agent_reconnect_backoff: 5
         container_auto_remove_disabled: true
-        resource_pool: pool1
\ No newline at end of file
+        resource_pool: pool1
+  
+  - agent:
+      name: agent30 # Copy of agent1, but with empty(default) resource pool.
+      config_file:
+        master_host: 127.0.0.1
+        master_port: 8081
+        agent_id: agent1
+        container_master_host: $DOCKER_LOCALHOST
+        agent_reconnect_attempts: 24
+        agent_reconnect_backoff: 5
+        container_auto_remove_disabled: true
\ No newline at end of file
diff --git a/e2e_tests/tests/cluster/test_master_restart.py b/e2e_tests/tests/cluster/test_master_restart.py
index 1bd9fb6423e..294f04fd41b 100644
--- a/e2e_tests/tests/cluster/test_master_restart.py
+++ b/e2e_tests/tests/cluster/test_master_restart.py
@@ -779,3 +779,23 @@ def test_agent_resource_pool_change(
     finally:
         restartable_managed_cluster.dc.kill_stage("agent20")
         restartable_managed_cluster.restart_agent()
+
+
+@pytest.mark.managed_devcluster
+def test_agent_resource_pool_unchanged(
+    restartable_managed_cluster: managed_cluster.ManagedCluster,
+) -> None:
+    admin = api_utils.admin_session()
+    try:
+        restartable_managed_cluster.kill_agent()
+        restartable_managed_cluster.dc.restart_stage("agent30")
+
+        for _i in range(5):
+            agent_data = managed_cluster.get_agent_data(admin)
+            if len(agent_data) == 0:
+                # Agent has exploded and been wiped due to resource pool mismatch,
+                # which is not expected.
+                pytest.fail("agent exploded even with the same resource pool")
+    finally:
+        restartable_managed_cluster.dc.kill_stage("agent30")
+        restartable_managed_cluster.restart_agent()
diff --git a/master/internal/rm/agentrm/agent_state.go b/master/internal/rm/agentrm/agent_state.go
index fd5e43a7964..da45dbbf383 100644
--- a/master/internal/rm/agentrm/agent_state.go
+++ b/master/internal/rm/agentrm/agent_state.go
@@ -20,6 +20,8 @@ import (
 	"github.com/determined-ai/determined/master/pkg/model"
 )
 
+const defaultResourcePoolName = "default"
+
 type slotEnabled struct {
 	deviceAdded  bool
 	agentEnabled bool
@@ -289,14 +291,18 @@ func (a *agentState) checkAgentStartedDevicesMatch(
 	return nil
 }
 
+// We need to compare the resource pool configurations between the agent and the master.
+// Ideally, the master doesn't request new resource pool information if the agent reconnects
+// within the designated reconnection period, while the agent should read from its updated configuration.
+// If there's a mismatch, an error will be thrown, causing the agent to stop and require a restart.
 func (a *agentState) checkAgentResourcePoolMatch(
 	agentStarted *aproto.AgentStarted,
 ) error {
-	// We need to compare the resource pool configurations between the agent and the master.
-	// Ideally, the master doesn't request new resource pool information if the agent reconnects
-	// within the designated reconnection period, while the agent should read from its updated configuration.
-	// If there's a mismatch, an error will be thrown, causing the agent to stop and require a restart.
-	if a.resourcePoolName != agentStarted.ResourcePoolName {
+	// If the agent's resource pool is empty in the configuration and the master has it set to default,
+	// the agent should not be restarted. However, if the agent's resource pool differs from the master's record,
+	// the agent should be restarted.
+	if !(agentStarted.ResourcePoolName == "" && a.resourcePoolName == defaultResourcePoolName) &&
+		(a.resourcePoolName != agentStarted.ResourcePoolName) {
 		return fmt.Errorf("resource pool has changed: %s -> %s", a.resourcePoolName, agentStarted.ResourcePoolName)
 	}
 
diff --git a/master/internal/rm/agentrm/agent_state_test.go b/master/internal/rm/agentrm/agent_state_test.go
index 56a9993bd33..79a0a9c0a2a 100644
--- a/master/internal/rm/agentrm/agent_state_test.go
+++ b/master/internal/rm/agentrm/agent_state_test.go
@@ -340,6 +340,16 @@ func Test_agentState_checkAgentResourcePoolMatch(t *testing.T) {
 			},
 			wantErrContains: "",
 		},
+		{
+			name: "resource pool name match",
+			state: agentState{
+				resourcePoolName: "default",
+			},
+			agentStarted: &aproto.AgentStarted{
+				ResourcePoolName: "",
+			},
+			wantErrContains: "",
+		},
 		{
 			name: "resource pool name is missing",
 			state: agentState{
@@ -348,6 +358,22 @@ func Test_agentState_checkAgentResourcePoolMatch(t *testing.T) {
 			agentStarted:    &aproto.AgentStarted{ResourcePoolName: ""},
 			wantErrContains: "resource pool has changed",
 		},
+		{
+			name: "resource pool name is missing",
+			state: agentState{
+				resourcePoolName: "default",
+			},
+			agentStarted:    &aproto.AgentStarted{ResourcePoolName: "pool1"},
+			wantErrContains: "resource pool has changed",
+		},
+		{
+			name: "resource pool name is missing",
+			state: agentState{
+				resourcePoolName: "",
+			},
+			agentStarted:    &aproto.AgentStarted{ResourcePoolName: "pool1"},
+			wantErrContains: "resource pool has changed",
+		},
 		{
 			name: "mismatched resource pool name",
 			state: agentState{
diff --git a/master/internal/rm/agentrm/resource_managers_test.go b/master/internal/rm/agentrm/resource_managers_test.go
index dea2af873ca..0cd82575b05 100644
--- a/master/internal/rm/agentrm/resource_managers_test.go
+++ b/master/internal/rm/agentrm/resource_managers_test.go
@@ -15,8 +15,6 @@ import (
 	"github.com/determined-ai/determined/master/internal/user"
 )
 
-const defaultResourcePoolName = "default"
-
 func TestResourceManagerForwardMessage(t *testing.T) {
 	user.InitService(nil, nil)
 	conf := &config.ResourceConfig{

From 8076a1b062b071bf81c3a69c149141acf539a60c Mon Sep 17 00:00:00 2001
From: ShreyaLnuHpe <shreya.lnu@hpe.com>
Date: Wed, 14 Aug 2024 16:48:11 -0700
Subject: [PATCH 04/16] defaulted resource pool in options

---
 agent/internal/options/options.go             |  1 +
 master/internal/rm/agentrm/agent_state.go     |  6 +----
 .../internal/rm/agentrm/agent_state_test.go   | 26 +++----------------
 3 files changed, 6 insertions(+), 27 deletions(-)

diff --git a/agent/internal/options/options.go b/agent/internal/options/options.go
index 74d72b9e426..02120f639ae 100644
--- a/agent/internal/options/options.go
+++ b/agent/internal/options/options.go
@@ -28,6 +28,7 @@ func DefaultOptions() *Options {
 			Level: "trace",
 			Color: true,
 		},
+		ResourcePool:           "default",
 		SlotType:               "auto",
 		VisibleGPUs:            VisibleGPUsFromEnvironment(),
 		BindIP:                 "0.0.0.0",
diff --git a/master/internal/rm/agentrm/agent_state.go b/master/internal/rm/agentrm/agent_state.go
index da45dbbf383..6b4d6a4cd14 100644
--- a/master/internal/rm/agentrm/agent_state.go
+++ b/master/internal/rm/agentrm/agent_state.go
@@ -298,11 +298,7 @@ func (a *agentState) checkAgentStartedDevicesMatch(
 func (a *agentState) checkAgentResourcePoolMatch(
 	agentStarted *aproto.AgentStarted,
 ) error {
-	// If the agent's resource pool is empty in the configuration and the master has it set to default,
-	// the agent should not be restarted. However, if the agent's resource pool differs from the master's record,
-	// the agent should be restarted.
-	if !(agentStarted.ResourcePoolName == "" && a.resourcePoolName == defaultResourcePoolName) &&
-		(a.resourcePoolName != agentStarted.ResourcePoolName) {
+	if a.resourcePoolName != agentStarted.ResourcePoolName {
 		return fmt.Errorf("resource pool has changed: %s -> %s", a.resourcePoolName, agentStarted.ResourcePoolName)
 	}
 
diff --git a/master/internal/rm/agentrm/agent_state_test.go b/master/internal/rm/agentrm/agent_state_test.go
index 79a0a9c0a2a..921c876c337 100644
--- a/master/internal/rm/agentrm/agent_state_test.go
+++ b/master/internal/rm/agentrm/agent_state_test.go
@@ -70,7 +70,7 @@ func TestAgentStatePersistence(t *testing.T) {
 		Version:              "",
 		Devices:              devices,
 		ContainersReattached: []aproto.ContainerReattachAck{},
-		ResourcePoolName:     "",
+		ResourcePoolName:     defaultResourcePoolName,
 	}
 	state.agentStarted(started)
 	require.Len(t, state.getSlotsSummary("/myagent"), 2)
@@ -340,36 +340,18 @@ func Test_agentState_checkAgentResourcePoolMatch(t *testing.T) {
 			},
 			wantErrContains: "",
 		},
-		{
-			name: "resource pool name match",
-			state: agentState{
-				resourcePoolName: "default",
-			},
-			agentStarted: &aproto.AgentStarted{
-				ResourcePoolName: "",
-			},
-			wantErrContains: "",
-		},
 		{
 			name: "resource pool name is missing",
 			state: agentState{
 				resourcePoolName: "pool1",
 			},
-			agentStarted:    &aproto.AgentStarted{ResourcePoolName: ""},
-			wantErrContains: "resource pool has changed",
-		},
-		{
-			name: "resource pool name is missing",
-			state: agentState{
-				resourcePoolName: "default",
-			},
-			agentStarted:    &aproto.AgentStarted{ResourcePoolName: "pool1"},
+			agentStarted:    &aproto.AgentStarted{ResourcePoolName: defaultResourcePoolName},
 			wantErrContains: "resource pool has changed",
 		},
 		{
 			name: "resource pool name is missing",
 			state: agentState{
-				resourcePoolName: "",
+				resourcePoolName: defaultResourcePoolName,
 			},
 			agentStarted:    &aproto.AgentStarted{ResourcePoolName: "pool1"},
 			wantErrContains: "resource pool has changed",
@@ -420,7 +402,7 @@ func TestSlotStates(t *testing.T) {
 		Version:              "",
 		Devices:              devices,
 		ContainersReattached: []aproto.ContainerReattachAck{},
-		ResourcePoolName:     "",
+		ResourcePoolName:     defaultResourcePoolName,
 	}
 	state.agentStarted(started)
 	slots := state.getSlotsSummary("/")

From cbf68842dd26c6a60295cfe47b4b8e7beeda5b57 Mon Sep 17 00:00:00 2001
From: ShreyaLnuHpe <shreya.lnu@hpe.com>
Date: Sun, 18 Aug 2024 22:33:53 -0700
Subject: [PATCH 05/16] correct e2e test with new fixture

---
 .../double-reattach.devcluster.yaml           |  29 +----
 .../multi-resource-pools.devcluster.yaml      | 110 ++++++++++++++++++
 agent/cmd/determined-agent/run_test.go        |   4 +
 agent/internal/options/options_test.go        |   1 +
 e2e_tests/tests/cluster/conftest.py           |   2 +
 e2e_tests/tests/cluster/managed_cluster.py    |  29 ++++-
 .../tests/cluster/test_master_restart.py      |  20 ++--
 7 files changed, 156 insertions(+), 39 deletions(-)
 create mode 100644 .circleci/devcluster/multi-resource-pools.devcluster.yaml

diff --git a/.circleci/devcluster/double-reattach.devcluster.yaml b/.circleci/devcluster/double-reattach.devcluster.yaml
index 59026d25f58..4aebab4c76b 100644
--- a/.circleci/devcluster/double-reattach.devcluster.yaml
+++ b/.circleci/devcluster/double-reattach.devcluster.yaml
@@ -43,10 +43,6 @@ stages:
             pool_name: default
             provider: null
             task_container_defaults: null
-          - pool_name: pool1
-            max_slots: 8
-            scheduler:
-              type: priority
         scim:
           enabled: true
           auth:
@@ -96,27 +92,4 @@ stages:
         agent_reconnect_attempts: 24
         agent_reconnect_backoff: 5
         container_auto_remove_disabled: true
-        artificial_slots: 4
-
-  - agent:
-      name: agent20 # Copy of agent1, but with different resource pool.
-      config_file:
-        master_host: 127.0.0.1
-        master_port: 8081
-        agent_id: agent1
-        container_master_host: $DOCKER_LOCALHOST
-        agent_reconnect_attempts: 24
-        agent_reconnect_backoff: 5
-        container_auto_remove_disabled: true
-        resource_pool: pool1
-  
-  - agent:
-      name: agent30 # Copy of agent1, but with empty(default) resource pool.
-      config_file:
-        master_host: 127.0.0.1
-        master_port: 8081
-        agent_id: agent1
-        container_master_host: $DOCKER_LOCALHOST
-        agent_reconnect_attempts: 24
-        agent_reconnect_backoff: 5
-        container_auto_remove_disabled: true
\ No newline at end of file
+        artificial_slots: 4
\ No newline at end of file
diff --git a/.circleci/devcluster/multi-resource-pools.devcluster.yaml b/.circleci/devcluster/multi-resource-pools.devcluster.yaml
new file mode 100644
index 00000000000..54986a82896
--- /dev/null
+++ b/.circleci/devcluster/multi-resource-pools.devcluster.yaml
@@ -0,0 +1,110 @@
+stages:
+  - db:
+      name: db
+
+  - master:
+      pre:
+        - sh: make -C tools prep-root
+      config_file:
+        security:
+          initial_user_password: $INITIAL_USER_PASSWORD
+        db:
+          host: localhost
+          port: 5432
+          password: postgres
+          user: postgres
+          name: determined
+        __internal:
+          preemption_timeout: 60s
+        checkpoint_storage:
+          type: shared_fs
+          host_path: /tmp
+          storage_path: determined-cp
+        log:
+          level: debug
+        root: tools/build
+        cache: 
+          cache_dir: /tmp/determined-cache
+        launch_error: false
+        telemetry:
+          enabled: false
+        resource_manager:
+          default_aux_resource_pool: default
+          default_compute_resource_pool: default
+          scheduler:
+            fitting_policy: best
+            type: fair_share
+          type: agent
+        resource_pools:
+          - agent_reattach_enabled: true
+            agent_reconnect_wait: 25s
+            description: ''
+            max_aux_containers_per_agent: 100
+            pool_name: default
+            provider: null
+            task_container_defaults: null
+          - pool_name: pool1
+            max_slots: 8
+            scheduler:
+              type: priority
+        scim:
+          enabled: true
+          auth:
+            type: basic
+            username: determined
+            password: password
+
+  - custom:
+      name: proxy
+      cmd: ["socat", "-d", "-d", "TCP-LISTEN:8081,reuseaddr,fork", "TCP:localhost:8080"]
+      post:
+        - conncheck:
+            port: 8081
+
+  - agent:
+      name: agent1
+      config_file:
+        master_host: 127.0.0.1
+        master_port: 8081
+        agent_id: agent1
+        container_master_host: $DOCKER_LOCALHOST
+        agent_reconnect_attempts: 24
+        agent_reconnect_backoff: 5
+        container_auto_remove_disabled: true
+        hooks:
+          on_connection_lost: ["touch", "/tmp/agent1-connection-lost"]
+
+
+  - agent:
+      name: agent2
+      config_file:
+        master_host: 127.0.0.1
+        master_port: 8081
+        agent_id: agent2
+        container_master_host: $DOCKER_LOCALHOST
+        agent_reconnect_attempts: 24
+        agent_reconnect_backoff: 5
+        container_auto_remove_disabled: true
+
+  - agent:
+      name: agent10 # Copy of agent1, but with different resource pool.
+      config_file:
+        master_host: 127.0.0.1
+        master_port: 8081
+        agent_id: agent1
+        container_master_host: $DOCKER_LOCALHOST
+        agent_reconnect_attempts: 24
+        agent_reconnect_backoff: 5
+        container_auto_remove_disabled: true
+        resource_pool: pool1
+  
+  - agent:
+      name: agent20 # Copy of agent1, but with empty(default) resource pool.
+      config_file:
+        master_host: 127.0.0.1
+        master_port: 8081
+        agent_id: agent1
+        container_master_host: $DOCKER_LOCALHOST
+        agent_reconnect_attempts: 24
+        agent_reconnect_backoff: 5
+        container_auto_remove_disabled: true
\ No newline at end of file
diff --git a/agent/cmd/determined-agent/run_test.go b/agent/cmd/determined-agent/run_test.go
index 593d749a4c5..d9ebda1ad4b 100644
--- a/agent/cmd/determined-agent/run_test.go
+++ b/agent/cmd/determined-agent/run_test.go
@@ -20,6 +20,7 @@ const DefaultRawConfig = `
 log:
     level: trace
     color: true
+resource_pool: default
 slot_type: auto
 security:
     tls:
@@ -100,6 +101,7 @@ func TestMergeAgentConfigViaNewViper(t *testing.T) {
 log:
     level: trace
     color: true
+resource_pool: default
 slot_type: auto
 security:
     tls:
@@ -157,6 +159,7 @@ func TestMergeAgentConfigViaViperWithDefaults(t *testing.T) {
 log:
     level: trace
     color: true
+resource_pool: default
 slot_type: auto
 security:
     tls:
@@ -213,6 +216,7 @@ func TestMergeAgentConfigViaViperWithDefaultsEnvAndFlags(t *testing.T) {
 log:
     level: trace
     color: true
+resource_pool: default
 slot_type: auto
 security:
     tls:
diff --git a/agent/internal/options/options_test.go b/agent/internal/options/options_test.go
index 3c114a2f513..8bc6f70813d 100644
--- a/agent/internal/options/options_test.go
+++ b/agent/internal/options/options_test.go
@@ -58,6 +58,7 @@ log:
 log:
     level: trace
     color: true
+resource_pool: default
 slot_type: auto
 security:
     tls:
diff --git a/e2e_tests/tests/cluster/conftest.py b/e2e_tests/tests/cluster/conftest.py
index 9b61e3a30d1..2b12723ca7a 100644
--- a/e2e_tests/tests/cluster/conftest.py
+++ b/e2e_tests/tests/cluster/conftest.py
@@ -6,6 +6,8 @@
     managed_cluster_session,
     managed_cluster_session_priority_scheduler,
     restartable_managed_cluster,
+    managed_cluster_multi_resource_pools,
+    managed_cluster_session_multi_resource_pools,
 )
 from .managed_cluster_k8s import k8s_managed_cluster  # noqa
 from .managed_slurm_cluster import (  # noqa
diff --git a/e2e_tests/tests/cluster/managed_cluster.py b/e2e_tests/tests/cluster/managed_cluster.py
index ea7a99fa592..c6c7c68632b 100644
--- a/e2e_tests/tests/cluster/managed_cluster.py
+++ b/e2e_tests/tests/cluster/managed_cluster.py
@@ -15,6 +15,9 @@
 DEVCLUSTER_REATTACH_OFF_CONFIG_PATH = DEVCLUSTER_CONFIG_ROOT_PATH / "double.devcluster.yaml"
 DEVCLUSTER_REATTACH_ON_CONFIG_PATH = DEVCLUSTER_CONFIG_ROOT_PATH / "double-reattach.devcluster.yaml"
 DEVCLUSTER_PRIORITY_SCHEDULER_CONFIG_PATH = DEVCLUSTER_CONFIG_ROOT_PATH / "priority.devcluster.yaml"
+DEVCLUSTER_MULTI_RP_CONFIG_PATH = (
+    DEVCLUSTER_CONFIG_ROOT_PATH / "multi-resource-pools.devcluster.yaml"
+)
 
 
 def get_agent_data(sess: api.Session) -> List[Dict[str, Any]]:
@@ -184,7 +187,7 @@ def managed_cluster_restarts(
     managed_cluster_session: ManagedCluster, request: Any
 ) -> Iterator[ManagedCluster]:  # check if priority scheduler or not using config.
     config = str(DEVCLUSTER_REATTACH_ON_CONFIG_PATH)
-    # port number is same for both reattach on and off config files so you can use either.
+    # port number is same for both reattach on and off config files or multi rp so you can use any.
     utils.set_master_port(config)
     nodeid = request.node.nodeid
     managed_cluster_session.log_marker(f"pytest [{utils.now_ts()}] {nodeid} setup\n")
@@ -204,3 +207,27 @@ def restartable_managed_cluster(
         managed_cluster_restarts.restart_master()
         managed_cluster_restarts.restart_agent()
         raise
+
+
+@pytest.fixture(scope="session")
+def managed_cluster_session_multi_resource_pools(request: Any) -> Iterator[ManagedCluster]:
+    config = str(DEVCLUSTER_MULTI_RP_CONFIG_PATH)
+    with ManagedCluster(config) as mc:
+        mc.initial_startup()
+        yield mc
+
+
+@pytest.fixture
+def managed_cluster_multi_resource_pools(
+    managed_cluster_session_multi_resource_pools: ManagedCluster, request: Any
+) -> Iterator[ManagedCluster]:
+    config = str(DEVCLUSTER_MULTI_RP_CONFIG_PATH)
+    utils.set_master_port(config)
+    nodeid = request.node.nodeid
+    managed_cluster_session_multi_resource_pools.log_marker(
+        f"pytest [{utils.now_ts()}] {nodeid} setup\n"
+    )
+    yield managed_cluster_session_multi_resource_pools
+    managed_cluster_session_multi_resource_pools.log_marker(
+        f"pytest [{utils.now_ts()}] {nodeid} teardown\n"
+    )
diff --git a/e2e_tests/tests/cluster/test_master_restart.py b/e2e_tests/tests/cluster/test_master_restart.py
index 294f04fd41b..3fdac2e24f0 100644
--- a/e2e_tests/tests/cluster/test_master_restart.py
+++ b/e2e_tests/tests/cluster/test_master_restart.py
@@ -760,12 +760,12 @@ def test_master_restart_with_queued(
 
 @pytest.mark.managed_devcluster
 def test_agent_resource_pool_change(
-    restartable_managed_cluster: managed_cluster.ManagedCluster,
+    managed_cluster_multi_resource_pools: managed_cluster.ManagedCluster,
 ) -> None:
     admin = api_utils.admin_session()
     try:
-        restartable_managed_cluster.kill_agent()
-        restartable_managed_cluster.dc.restart_stage("agent20")
+        managed_cluster_multi_resource_pools.kill_agent()
+        managed_cluster_multi_resource_pools.dc.restart_stage("agent10")
 
         for _i in range(5):
             agent_data = managed_cluster.get_agent_data(admin)
@@ -777,18 +777,18 @@ def test_agent_resource_pool_change(
                 f"agent with different resource pool is still present after {_i} ticks:{agent_data}"
             )
     finally:
-        restartable_managed_cluster.dc.kill_stage("agent20")
-        restartable_managed_cluster.restart_agent()
+        managed_cluster_multi_resource_pools.dc.kill_stage("agent10")
+        managed_cluster_multi_resource_pools.restart_agent()
 
 
 @pytest.mark.managed_devcluster
 def test_agent_resource_pool_unchanged(
-    restartable_managed_cluster: managed_cluster.ManagedCluster,
+    managed_cluster_multi_resource_pools: managed_cluster.ManagedCluster,
 ) -> None:
     admin = api_utils.admin_session()
     try:
-        restartable_managed_cluster.kill_agent()
-        restartable_managed_cluster.dc.restart_stage("agent30")
+        managed_cluster_multi_resource_pools.kill_agent()
+        managed_cluster_multi_resource_pools.dc.restart_stage("agent20")
 
         for _i in range(5):
             agent_data = managed_cluster.get_agent_data(admin)
@@ -797,5 +797,5 @@ def test_agent_resource_pool_unchanged(
                 # which is not expected.
                 pytest.fail("agent exploded even with the same resource pool")
     finally:
-        restartable_managed_cluster.dc.kill_stage("agent30")
-        restartable_managed_cluster.restart_agent()
+        managed_cluster_multi_resource_pools.dc.kill_stage("agent20")
+        managed_cluster_multi_resource_pools.restart_agent()

From 47bea5b183385c00f1250c8d8170f20de4538d44 Mon Sep 17 00:00:00 2001
From: ShreyaLnuHpe <shreya.lnu@hpe.com>
Date: Sun, 18 Aug 2024 22:33:53 -0700
Subject: [PATCH 06/16] correct e2e test with new fixture

---
 e2e_tests/tests/cluster/conftest.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/e2e_tests/tests/cluster/conftest.py b/e2e_tests/tests/cluster/conftest.py
index 2b12723ca7a..bbaba0fbc31 100644
--- a/e2e_tests/tests/cluster/conftest.py
+++ b/e2e_tests/tests/cluster/conftest.py
@@ -1,9 +1,11 @@
 """Makes managed_cluster fixtures available to all files in the directory"""
 
 from .managed_cluster import (  # noqa
+    managed_cluster_multi_resource_pools,
     managed_cluster_priority_scheduler,
     managed_cluster_restarts,
     managed_cluster_session,
+    managed_cluster_session_multi_resource_pools,
     managed_cluster_session_priority_scheduler,
     restartable_managed_cluster,
     managed_cluster_multi_resource_pools,

From d93bb752648015ed91a51b099271a65749f06e8c Mon Sep 17 00:00:00 2001
From: ShreyaLnuHpe <shreya.lnu@hpe.com>
Date: Sun, 18 Aug 2024 22:48:44 -0700
Subject: [PATCH 07/16] Sort in conftest.py

---
 e2e_tests/tests/cluster/conftest.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/e2e_tests/tests/cluster/conftest.py b/e2e_tests/tests/cluster/conftest.py
index bbaba0fbc31..2dc82af4d25 100644
--- a/e2e_tests/tests/cluster/conftest.py
+++ b/e2e_tests/tests/cluster/conftest.py
@@ -8,8 +8,6 @@
     managed_cluster_session_multi_resource_pools,
     managed_cluster_session_priority_scheduler,
     restartable_managed_cluster,
-    managed_cluster_multi_resource_pools,
-    managed_cluster_session_multi_resource_pools,
 )
 from .managed_cluster_k8s import k8s_managed_cluster  # noqa
 from .managed_slurm_cluster import (  # noqa

From 556220b9c90e5212af54156f4de18b8e316f78fa Mon Sep 17 00:00:00 2001
From: ShreyaLnuHpe <shreya.lnu@hpe.com>
Date: Mon, 19 Aug 2024 00:08:36 -0700
Subject: [PATCH 08/16] create restartable managed cluster

---
 e2e_tests/tests/cluster/conftest.py           |  1 +
 e2e_tests/tests/cluster/managed_cluster.py    | 14 +++++++++++++
 .../tests/cluster/test_master_restart.py      | 20 +++++++++----------
 3 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/e2e_tests/tests/cluster/conftest.py b/e2e_tests/tests/cluster/conftest.py
index 2dc82af4d25..ba545dd0546 100644
--- a/e2e_tests/tests/cluster/conftest.py
+++ b/e2e_tests/tests/cluster/conftest.py
@@ -8,6 +8,7 @@
     managed_cluster_session_multi_resource_pools,
     managed_cluster_session_priority_scheduler,
     restartable_managed_cluster,
+    restartable_managed_cluster_multi_resource_pools,
 )
 from .managed_cluster_k8s import k8s_managed_cluster  # noqa
 from .managed_slurm_cluster import (  # noqa
diff --git a/e2e_tests/tests/cluster/managed_cluster.py b/e2e_tests/tests/cluster/managed_cluster.py
index c6c7c68632b..d62275c7ded 100644
--- a/e2e_tests/tests/cluster/managed_cluster.py
+++ b/e2e_tests/tests/cluster/managed_cluster.py
@@ -231,3 +231,17 @@ def managed_cluster_multi_resource_pools(
     managed_cluster_session_multi_resource_pools.log_marker(
         f"pytest [{utils.now_ts()}] {nodeid} teardown\n"
     )
+
+
+@pytest.fixture
+def restartable_managed_cluster_multi_resource_pools(
+    managed_cluster_multi_resource_pools: ManagedCluster,
+) -> Iterator[ManagedCluster]:
+    managed_cluster_multi_resource_pools.wait_for_agent_ok(20)
+    try:
+        yield managed_cluster_multi_resource_pools
+        managed_cluster_multi_resource_pools.wait_for_agent_ok(20)
+    except Exception:
+        managed_cluster_multi_resource_pools.restart_master()
+        managed_cluster_multi_resource_pools.restart_agent()
+        raise
diff --git a/e2e_tests/tests/cluster/test_master_restart.py b/e2e_tests/tests/cluster/test_master_restart.py
index 3fdac2e24f0..053a38f8b1d 100644
--- a/e2e_tests/tests/cluster/test_master_restart.py
+++ b/e2e_tests/tests/cluster/test_master_restart.py
@@ -760,12 +760,12 @@ def test_master_restart_with_queued(
 
 @pytest.mark.managed_devcluster
 def test_agent_resource_pool_change(
-    managed_cluster_multi_resource_pools: managed_cluster.ManagedCluster,
+    restartable_managed_cluster_multi_resource_pools: managed_cluster.ManagedCluster,
 ) -> None:
     admin = api_utils.admin_session()
     try:
-        managed_cluster_multi_resource_pools.kill_agent()
-        managed_cluster_multi_resource_pools.dc.restart_stage("agent10")
+        restartable_managed_cluster_multi_resource_pools.kill_agent()
+        restartable_managed_cluster_multi_resource_pools.dc.restart_stage("agent10")
 
         for _i in range(5):
             agent_data = managed_cluster.get_agent_data(admin)
@@ -777,18 +777,18 @@ def test_agent_resource_pool_change(
                 f"agent with different resource pool is still present after {_i} ticks:{agent_data}"
             )
     finally:
-        managed_cluster_multi_resource_pools.dc.kill_stage("agent10")
-        managed_cluster_multi_resource_pools.restart_agent()
+        restartable_managed_cluster_multi_resource_pools.dc.kill_stage("agent10")
+        restartable_managed_cluster_multi_resource_pools.restart_agent()
 
 
 @pytest.mark.managed_devcluster
 def test_agent_resource_pool_unchanged(
-    managed_cluster_multi_resource_pools: managed_cluster.ManagedCluster,
+    restartable_managed_cluster_multi_resource_pools: managed_cluster.ManagedCluster,
 ) -> None:
     admin = api_utils.admin_session()
     try:
-        managed_cluster_multi_resource_pools.kill_agent()
-        managed_cluster_multi_resource_pools.dc.restart_stage("agent20")
+        restartable_managed_cluster_multi_resource_pools.kill_agent()
+        restartable_managed_cluster_multi_resource_pools.dc.restart_stage("agent20")
 
         for _i in range(5):
             agent_data = managed_cluster.get_agent_data(admin)
@@ -797,5 +797,5 @@ def test_agent_resource_pool_unchanged(
                 # which is not expected.
                 pytest.fail("agent exploded even with the same resource pool")
     finally:
-        managed_cluster_multi_resource_pools.dc.kill_stage("agent20")
-        managed_cluster_multi_resource_pools.restart_agent()
+        restartable_managed_cluster_multi_resource_pools.dc.kill_stage("agent20")
+        restartable_managed_cluster_multi_resource_pools.restart_agent()

From b3eb2d026f5bb1c18124aa35b84975131978477d Mon Sep 17 00:00:00 2001
From: ShreyaLnuHpe <shreya.lnu@hpe.com>
Date: Mon, 19 Aug 2024 10:57:06 -0700
Subject: [PATCH 09/16] change port for connection

---
 .../multi-resource-pools.devcluster.yaml          | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/.circleci/devcluster/multi-resource-pools.devcluster.yaml b/.circleci/devcluster/multi-resource-pools.devcluster.yaml
index 54986a82896..42effbd2427 100644
--- a/.circleci/devcluster/multi-resource-pools.devcluster.yaml
+++ b/.circleci/devcluster/multi-resource-pools.devcluster.yaml
@@ -1,6 +1,7 @@
 stages:
   - db:
       name: db
+      port: 5436
 
   - master:
       pre:
@@ -10,7 +11,7 @@ stages:
           initial_user_password: $INITIAL_USER_PASSWORD
         db:
           host: localhost
-          port: 5432
+          port: 5436
           password: postgres
           user: postgres
           name: determined
@@ -56,16 +57,16 @@ stages:
 
   - custom:
       name: proxy
-      cmd: ["socat", "-d", "-d", "TCP-LISTEN:8081,reuseaddr,fork", "TCP:localhost:8080"]
+      cmd: ["socat", "-d", "-d", "TCP-LISTEN:8083,reuseaddr,fork", "TCP:localhost:8080"]
       post:
         - conncheck:
-            port: 8081
+            port: 8083
 
   - agent:
       name: agent1
       config_file:
         master_host: 127.0.0.1
-        master_port: 8081
+        master_port: 8083
         agent_id: agent1
         container_master_host: $DOCKER_LOCALHOST
         agent_reconnect_attempts: 24
@@ -79,7 +80,7 @@ stages:
       name: agent2
       config_file:
         master_host: 127.0.0.1
-        master_port: 8081
+        master_port: 8083
         agent_id: agent2
         container_master_host: $DOCKER_LOCALHOST
         agent_reconnect_attempts: 24
@@ -90,7 +91,7 @@ stages:
       name: agent10 # Copy of agent1, but with different resource pool.
       config_file:
         master_host: 127.0.0.1
-        master_port: 8081
+        master_port: 8083
         agent_id: agent1
         container_master_host: $DOCKER_LOCALHOST
         agent_reconnect_attempts: 24
@@ -102,7 +103,7 @@ stages:
       name: agent20 # Copy of agent1, but with empty(default) resource pool.
       config_file:
         master_host: 127.0.0.1
-        master_port: 8081
+        master_port: 8083
         agent_id: agent1
         container_master_host: $DOCKER_LOCALHOST
         agent_reconnect_attempts: 24

From 69920f5e714a1635d09b79e6a03617b66fc4e892 Mon Sep 17 00:00:00 2001
From: ShreyaLnuHpe <shreya.lnu@hpe.com>
Date: Wed, 21 Aug 2024 21:32:16 -0700
Subject: [PATCH 10/16] Test new mark

---
 .circleci/real_config.yml                     | 14 +++++
 e2e_tests/pytest.ini                          |  1 +
 .../tests/cluster/test_master_restart.py      | 56 +++++++++++++++++--
 e2e_tests/tests/conftest.py                   |  1 +
 4 files changed, 68 insertions(+), 4 deletions(-)

diff --git a/.circleci/real_config.yml b/.circleci/real_config.yml
index d31bf390743..a9959a76190 100644
--- a/.circleci/real_config.yml
+++ b/.circleci/real_config.yml
@@ -4512,6 +4512,20 @@ workflows:
           extra-pytest-flags: "--no-compare-stats"
           collect-det-job-logs: false
 
+      - test-e2e:
+          name: test-e2e-managed-devcluster-resource-pools
+          context:
+            - dev-ci-cluster-default-user-credentials
+          requires:
+            - build-go-ee
+          parallelism: 4
+          resource-class: large
+          tf2: true
+          mark: managed_devcluster_resource_pools
+          managed-devcluster: true
+          extra-pytest-flags: "--no-compare-stats"
+          collect-det-job-logs: false
+
       - test-e2e:
           name: test-e2e-multi-k8s
           context:
diff --git a/e2e_tests/pytest.ini b/e2e_tests/pytest.ini
index 44c0c56e259..153cecd22ff 100644
--- a/e2e_tests/pytest.ini
+++ b/e2e_tests/pytest.ini
@@ -37,6 +37,7 @@ markers =
     nightly: nightly tests
     det_deploy_local: test det deploy local
     managed_devcluster: cluster tests that require a pytest-side managed cluster
+    managed_devcluster_resource_pools: cluster tests that require a pytest-side managed cluster with multiple resource pools
     port_registry: tests for port registry and unique port offset
 
     distributed_quarantine: distributed training tests (quarantine)
diff --git a/e2e_tests/tests/cluster/test_master_restart.py b/e2e_tests/tests/cluster/test_master_restart.py
index 053a38f8b1d..254c8e38635 100644
--- a/e2e_tests/tests/cluster/test_master_restart.py
+++ b/e2e_tests/tests/cluster/test_master_restart.py
@@ -14,6 +14,7 @@
 from tests import detproc
 from tests import experiment as exp
 from tests.cluster import abstract_cluster, managed_cluster, managed_cluster_k8s, utils
+# from tests.cluster import managed_devcluster_resource_pools
 from tests.task import task
 
 logger = logging.getLogger(__name__)
@@ -758,9 +759,10 @@ def test_master_restart_with_queued(
         utils.assert_command_succeeded(sess, cmd_id)
 
 
-@pytest.mark.managed_devcluster
+@pytest.mark.managed_devcluster_resource_pools
 def test_agent_resource_pool_change(
-    restartable_managed_cluster_multi_resource_pools: managed_cluster.ManagedCluster,
+    restartable_managed_cluster_multi_resource_pools:
+    managed_cluster.ManagedCluster,
 ) -> None:
     admin = api_utils.admin_session()
     try:
@@ -781,9 +783,10 @@ def test_agent_resource_pool_change(
         restartable_managed_cluster_multi_resource_pools.restart_agent()
 
 
-@pytest.mark.managed_devcluster
+@pytest.mark.managed_devcluster_resource_pools
 def test_agent_resource_pool_unchanged(
-    restartable_managed_cluster_multi_resource_pools: managed_cluster.ManagedCluster,
+    restartable_managed_cluster_multi_resource_pools:
+    managed_cluster.ManagedCluster,
 ) -> None:
     admin = api_utils.admin_session()
     try:
@@ -799,3 +802,48 @@ def test_agent_resource_pool_unchanged(
     finally:
         restartable_managed_cluster_multi_resource_pools.dc.kill_stage("agent20")
         restartable_managed_cluster_multi_resource_pools.restart_agent()
+
+
+# @pytest.mark.managed_devcluster_resource_pools
+# def test_agent_resource_pool_change(
+#     restartable_managed_cluster_multi_resource_pools:
+#     managed_devcluster_resource_pools.ManagedDevclusterResourcePools,
+# ) -> None:
+#     admin = api_utils.admin_session()
+#     try:
+#         restartable_managed_cluster_multi_resource_pools.kill_agent()
+#         restartable_managed_cluster_multi_resource_pools.dc.restart_stage("agent10")
+
+#         for _i in range(5):
+#             agent_data = managed_cluster.get_agent_data(admin)
+#             if len(agent_data) == 0:
+#                 # Agent has exploded and been wiped due to resource pool mismatch, as expected.
+#                 break
+#         else:
+#             pytest.fail(
+#                 f"agent with different resource pool is still present after {_i} ticks:{agent_data}"
+#             )
+#     finally:
+#         restartable_managed_cluster_multi_resource_pools.dc.kill_stage("agent10")
+#         restartable_managed_cluster_multi_resource_pools.restart_agent()
+
+
+# @pytest.mark.managed_devcluster_resource_pools
+# def test_agent_resource_pool_unchanged(
+#     restartable_managed_cluster_multi_resource_pools:
+#     managed_devcluster_resource_pools.ManagedDevclusterResourcePools,
+# ) -> None:
+#     admin = api_utils.admin_session()
+#     try:
+#         restartable_managed_cluster_multi_resource_pools.kill_agent()
+#         restartable_managed_cluster_multi_resource_pools.dc.restart_stage("agent20")
+
+#         for _i in range(5):
+#             agent_data = managed_cluster.get_agent_data(admin)
+#             if len(agent_data) == 0:
+#                 # Agent has exploded and been wiped due to resource pool mismatch,
+#                 # which is not expected.
+#                 pytest.fail("agent exploded even with the same resource pool")
+#     finally:
+#         restartable_managed_cluster_multi_resource_pools.dc.kill_stage("agent20")
+#         restartable_managed_cluster_multi_resource_pools.restart_agent()
diff --git a/e2e_tests/tests/conftest.py b/e2e_tests/tests/conftest.py
index 0c8a2498ce4..d0c96bf53d3 100644
--- a/e2e_tests/tests/conftest.py
+++ b/e2e_tests/tests/conftest.py
@@ -49,6 +49,7 @@
     "model_hub_mmdetection",
     "deepspeed",
     "managed_devcluster",
+    "managed_devcluster_resource_pools",
     "port_registry",
     "distributed_quarantine",
     "det_deploy_local_quarantine",

From 27825d2410ad7e8887190c8c3932997d3f8fc635 Mon Sep 17 00:00:00 2001
From: ShreyaLnuHpe <shreya.lnu@hpe.com>
Date: Wed, 21 Aug 2024 22:16:42 -0700
Subject: [PATCH 11/16] remove comments/unused lines

---
 .../multi-resource-pools.devcluster.yaml      | 16 +++---
 .../tests/cluster/test_master_restart.py      | 52 +------------------
 2 files changed, 10 insertions(+), 58 deletions(-)

diff --git a/.circleci/devcluster/multi-resource-pools.devcluster.yaml b/.circleci/devcluster/multi-resource-pools.devcluster.yaml
index 42effbd2427..833dc58d34b 100644
--- a/.circleci/devcluster/multi-resource-pools.devcluster.yaml
+++ b/.circleci/devcluster/multi-resource-pools.devcluster.yaml
@@ -1,7 +1,7 @@
 stages:
   - db:
       name: db
-      port: 5436
+      port: 5434
 
   - master:
       pre:
@@ -11,7 +11,7 @@ stages:
           initial_user_password: $INITIAL_USER_PASSWORD
         db:
           host: localhost
-          port: 5436
+          port: 5434
           password: postgres
           user: postgres
           name: determined
@@ -57,16 +57,16 @@ stages:
 
   - custom:
       name: proxy
-      cmd: ["socat", "-d", "-d", "TCP-LISTEN:8083,reuseaddr,fork", "TCP:localhost:8080"]
+      cmd: ["socat", "-d", "-d", "TCP-LISTEN:8081,reuseaddr,fork", "TCP:localhost:8080"]
       post:
         - conncheck:
-            port: 8083
+            port: 8081
 
   - agent:
       name: agent1
       config_file:
         master_host: 127.0.0.1
-        master_port: 8083
+        master_port: 8081
         agent_id: agent1
         container_master_host: $DOCKER_LOCALHOST
         agent_reconnect_attempts: 24
@@ -80,7 +80,7 @@ stages:
       name: agent2
       config_file:
         master_host: 127.0.0.1
-        master_port: 8083
+        master_port: 8081
         agent_id: agent2
         container_master_host: $DOCKER_LOCALHOST
         agent_reconnect_attempts: 24
@@ -91,7 +91,7 @@ stages:
       name: agent10 # Copy of agent1, but with different resource pool.
       config_file:
         master_host: 127.0.0.1
-        master_port: 8083
+        master_port: 8081
         agent_id: agent1
         container_master_host: $DOCKER_LOCALHOST
         agent_reconnect_attempts: 24
@@ -103,7 +103,7 @@ stages:
       name: agent20 # Copy of agent1, but with empty(default) resource pool.
       config_file:
         master_host: 127.0.0.1
-        master_port: 8083
+        master_port: 8081
         agent_id: agent1
         container_master_host: $DOCKER_LOCALHOST
         agent_reconnect_attempts: 24
diff --git a/e2e_tests/tests/cluster/test_master_restart.py b/e2e_tests/tests/cluster/test_master_restart.py
index 254c8e38635..8e95025cdea 100644
--- a/e2e_tests/tests/cluster/test_master_restart.py
+++ b/e2e_tests/tests/cluster/test_master_restart.py
@@ -14,7 +14,6 @@
 from tests import detproc
 from tests import experiment as exp
 from tests.cluster import abstract_cluster, managed_cluster, managed_cluster_k8s, utils
-# from tests.cluster import managed_devcluster_resource_pools
 from tests.task import task
 
 logger = logging.getLogger(__name__)
@@ -761,8 +760,7 @@ def test_master_restart_with_queued(
 
 @pytest.mark.managed_devcluster_resource_pools
 def test_agent_resource_pool_change(
-    restartable_managed_cluster_multi_resource_pools:
-    managed_cluster.ManagedCluster,
+    restartable_managed_cluster_multi_resource_pools: managed_cluster.ManagedCluster,
 ) -> None:
     admin = api_utils.admin_session()
     try:
@@ -785,8 +783,7 @@ def test_agent_resource_pool_change(
 
 @pytest.mark.managed_devcluster_resource_pools
 def test_agent_resource_pool_unchanged(
-    restartable_managed_cluster_multi_resource_pools:
-    managed_cluster.ManagedCluster,
+    restartable_managed_cluster_multi_resource_pools: managed_cluster.ManagedCluster,
 ) -> None:
     admin = api_utils.admin_session()
     try:
@@ -802,48 +799,3 @@ def test_agent_resource_pool_unchanged(
     finally:
         restartable_managed_cluster_multi_resource_pools.dc.kill_stage("agent20")
         restartable_managed_cluster_multi_resource_pools.restart_agent()
-
-
-# @pytest.mark.managed_devcluster_resource_pools
-# def test_agent_resource_pool_change(
-#     restartable_managed_cluster_multi_resource_pools:
-#     managed_devcluster_resource_pools.ManagedDevclusterResourcePools,
-# ) -> None:
-#     admin = api_utils.admin_session()
-#     try:
-#         restartable_managed_cluster_multi_resource_pools.kill_agent()
-#         restartable_managed_cluster_multi_resource_pools.dc.restart_stage("agent10")
-
-#         for _i in range(5):
-#             agent_data = managed_cluster.get_agent_data(admin)
-#             if len(agent_data) == 0:
-#                 # Agent has exploded and been wiped due to resource pool mismatch, as expected.
-#                 break
-#         else:
-#             pytest.fail(
-#                 f"agent with different resource pool is still present after {_i} ticks:{agent_data}"
-#             )
-#     finally:
-#         restartable_managed_cluster_multi_resource_pools.dc.kill_stage("agent10")
-#         restartable_managed_cluster_multi_resource_pools.restart_agent()
-
-
-# @pytest.mark.managed_devcluster_resource_pools
-# def test_agent_resource_pool_unchanged(
-#     restartable_managed_cluster_multi_resource_pools:
-#     managed_devcluster_resource_pools.ManagedDevclusterResourcePools,
-# ) -> None:
-#     admin = api_utils.admin_session()
-#     try:
-#         restartable_managed_cluster_multi_resource_pools.kill_agent()
-#         restartable_managed_cluster_multi_resource_pools.dc.restart_stage("agent20")
-
-#         for _i in range(5):
-#             agent_data = managed_cluster.get_agent_data(admin)
-#             if len(agent_data) == 0:
-#                 # Agent has exploded and been wiped due to resource pool mismatch,
-#                 # which is not expected.
-#                 pytest.fail("agent exploded even with the same resource pool")
-#     finally:
-#         restartable_managed_cluster_multi_resource_pools.dc.kill_stage("agent20")
-#         restartable_managed_cluster_multi_resource_pools.restart_agent()

From 178a0969038aa036a8742f2f58c3c2b4842ee7ad Mon Sep 17 00:00:00 2001
From: ShreyaLnuHpe <shreya.lnu@hpe.com>
Date: Wed, 21 Aug 2024 22:19:56 -0700
Subject: [PATCH 12/16] minor correction

---
 e2e_tests/tests/cluster/managed_cluster.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/e2e_tests/tests/cluster/managed_cluster.py b/e2e_tests/tests/cluster/managed_cluster.py
index d62275c7ded..c31484b6ef9 100644
--- a/e2e_tests/tests/cluster/managed_cluster.py
+++ b/e2e_tests/tests/cluster/managed_cluster.py
@@ -187,7 +187,7 @@ def managed_cluster_restarts(
     managed_cluster_session: ManagedCluster, request: Any
 ) -> Iterator[ManagedCluster]:  # check if priority scheduler or not using config.
     config = str(DEVCLUSTER_REATTACH_ON_CONFIG_PATH)
-    # port number is same for both reattach on and off config files or multi rp so you can use any.
+    # port number is same for both reattach on and off config files so you can use either.
     utils.set_master_port(config)
     nodeid = request.node.nodeid
     managed_cluster_session.log_marker(f"pytest [{utils.now_ts()}] {nodeid} setup\n")

From 4877be6feaea23fcec8477639ed3ff69c6c3c84d Mon Sep 17 00:00:00 2001
From: ShreyaLnuHpe <shreya.lnu@hpe.com>
Date: Tue, 27 Aug 2024 18:31:06 -0700
Subject: [PATCH 13/16] changes based on comments

---
 .../devcluster/double-reattach.devcluster.yaml   |  2 +-
 master/internal/rm/agentrm/agent.go              |  2 +-
 master/internal/rm/agentrm/agent_state_test.go   | 16 ++++++++++------
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/.circleci/devcluster/double-reattach.devcluster.yaml b/.circleci/devcluster/double-reattach.devcluster.yaml
index 4aebab4c76b..a7111202fa8 100644
--- a/.circleci/devcluster/double-reattach.devcluster.yaml
+++ b/.circleci/devcluster/double-reattach.devcluster.yaml
@@ -92,4 +92,4 @@ stages:
         agent_reconnect_attempts: 24
         agent_reconnect_backoff: 5
         container_auto_remove_disabled: true
-        artificial_slots: 4
\ No newline at end of file
+        artificial_slots: 4
diff --git a/master/internal/rm/agentrm/agent.go b/master/internal/rm/agentrm/agent.go
index 7c762565e05..ac87d2f427c 100644
--- a/master/internal/rm/agentrm/agent.go
+++ b/master/internal/rm/agentrm/agent.go
@@ -623,7 +623,7 @@ func (a *agent) HandleIncomingWebsocketMessage(msg *aproto.MasterMessage) {
 			resourcePoolErr := a.agentState.checkAgentResourcePoolMatch(msg.AgentStarted)
 			if resourcePoolErr != nil {
 				a.syslog.WithError(resourcePoolErr).
-					Error("change in agent resource pool was detected")
+					Error("change in agent resource pool was detected during reconnect")
 				a.socket.Outbox <- aproto.AgentMessage{
 					AgentShutdown: &aproto.AgentShutdown{
 						ErrMsg: aproto.ErrAgentMustReconnect.Error(),
diff --git a/master/internal/rm/agentrm/agent_state_test.go b/master/internal/rm/agentrm/agent_state_test.go
index 921c876c337..217c44c29d7 100644
--- a/master/internal/rm/agentrm/agent_state_test.go
+++ b/master/internal/rm/agentrm/agent_state_test.go
@@ -324,6 +324,10 @@ func Test_agentState_checkAgentStartedDevicesMatch(t *testing.T) {
 }
 
 func Test_agentState_checkAgentResourcePoolMatch(t *testing.T) {
+	const (
+		poolOne = "pool1"
+		poolTwo = "pool2"
+	)
 	tests := []struct {
 		name            string
 		state           agentState
@@ -333,17 +337,17 @@ func Test_agentState_checkAgentResourcePoolMatch(t *testing.T) {
 		{
 			name: "resource pool name match",
 			state: agentState{
-				resourcePoolName: "pool1",
+				resourcePoolName: poolOne,
 			},
 			agentStarted: &aproto.AgentStarted{
-				ResourcePoolName: "pool1",
+				ResourcePoolName: poolOne,
 			},
 			wantErrContains: "",
 		},
 		{
 			name: "resource pool name is missing",
 			state: agentState{
-				resourcePoolName: "pool1",
+				resourcePoolName: poolOne,
 			},
 			agentStarted:    &aproto.AgentStarted{ResourcePoolName: defaultResourcePoolName},
 			wantErrContains: "resource pool has changed",
@@ -353,16 +357,16 @@ func Test_agentState_checkAgentResourcePoolMatch(t *testing.T) {
 			state: agentState{
 				resourcePoolName: defaultResourcePoolName,
 			},
-			agentStarted:    &aproto.AgentStarted{ResourcePoolName: "pool1"},
+			agentStarted:    &aproto.AgentStarted{ResourcePoolName: poolOne},
 			wantErrContains: "resource pool has changed",
 		},
 		{
 			name: "mismatched resource pool name",
 			state: agentState{
-				resourcePoolName: "pool1",
+				resourcePoolName: poolOne,
 			},
 			agentStarted: &aproto.AgentStarted{
-				ResourcePoolName: "pool2",
+				ResourcePoolName: poolTwo,
 			},
 			wantErrContains: "resource pool has changed",
 		},

From 534228125cfe71af1ad1bc9129eb22cd9ee4b85f Mon Sep 17 00:00:00 2001
From: ShreyaLnuHpe <shreya.lnu@hpe.com>
Date: Tue, 27 Aug 2024 18:32:27 -0700
Subject: [PATCH 14/16] add newline

---
 .circleci/devcluster/multi-resource-pools.devcluster.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.circleci/devcluster/multi-resource-pools.devcluster.yaml b/.circleci/devcluster/multi-resource-pools.devcluster.yaml
index 833dc58d34b..5529e251148 100644
--- a/.circleci/devcluster/multi-resource-pools.devcluster.yaml
+++ b/.circleci/devcluster/multi-resource-pools.devcluster.yaml
@@ -108,4 +108,4 @@ stages:
         container_master_host: $DOCKER_LOCALHOST
         agent_reconnect_attempts: 24
         agent_reconnect_backoff: 5
-        container_auto_remove_disabled: true
\ No newline at end of file
+        container_auto_remove_disabled: true

From f329db484c1cd286e32858aaf5fb21adff762120 Mon Sep 17 00:00:00 2001
From: ShreyaLnuHpe <shreya.lnu@hpe.com>
Date: Wed, 28 Aug 2024 12:38:08 -0700
Subject: [PATCH 15/16] add note

---
 docs/get-started/architecture/introduction.rst | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/docs/get-started/architecture/introduction.rst b/docs/get-started/architecture/introduction.rst
index 1a4585cb65d..bc13ae9022e 100644
--- a/docs/get-started/architecture/introduction.rst
+++ b/docs/get-started/architecture/introduction.rst
@@ -391,6 +391,11 @@ If you are using static resource pools and launching agents by hand, you will ne
 :ref:`agent configuration <agent-config-reference>` to specify which resource pool the agent should
 join.
 
+Note that to change an agent's assigned resource_pool after it has already joined one, you need to
+update the :ref:`agent configuration <agent-config-reference>`. Make sure to drain the agents
+properly before modifying the resource_pool. After making the changes, restart the agent to connect
+it to the new resource pool.
+
 Migrate to Resource Pools
 -------------------------
 

From 2cbf48f2443432304c6882a9b73544ac2a7ee0d2 Mon Sep 17 00:00:00 2001
From: ShreyaLnuHpe <shreya.lnu@hpe.com>
Date: Wed, 28 Aug 2024 13:10:08 -0700
Subject: [PATCH 16/16] correct the note

---
 docs/get-started/architecture/introduction.rst | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/get-started/architecture/introduction.rst b/docs/get-started/architecture/introduction.rst
index bc13ae9022e..815e4fbd261 100644
--- a/docs/get-started/architecture/introduction.rst
+++ b/docs/get-started/architecture/introduction.rst
@@ -391,10 +391,10 @@ If you are using static resource pools and launching agents by hand, you will ne
 :ref:`agent configuration <agent-config-reference>` to specify which resource pool the agent should
 join.
 
-Note that to change an agent's assigned resource_pool after it has already joined one, you need to
-update the :ref:`agent configuration <agent-config-reference>`. Make sure to drain the agents
-properly before modifying the resource_pool. After making the changes, restart the agent to connect
-it to the new resource pool.
+To change the resource pool an agent is assigned to after it has already joined one, you need to
+update the :ref:`agent configuration <agent-config-reference>`. Before making this change, ensure
+the agents are properly drained. Once the configuration is updated, restart the agent to connect it
+to the new resource pool.
 
 Migrate to Resource Pools
 -------------------------