From 0a1d604198b455c75f949188384ae3d261c80672 Mon Sep 17 00:00:00 2001 From: Cheyu Wu Date: Mon, 13 Jan 2025 02:41:32 +0800 Subject: [PATCH 01/13] feat: add ValidateRayClusterSpec in raycluster webhook and add unit test --- .../apis/ray/v1/raycluster_webhook.go | 20 ++ .../apis/ray/v1/raycluster_webhook_test.go | 251 ++++++++++++++++++ 2 files changed, 271 insertions(+) create mode 100644 ray-operator/apis/ray/v1/raycluster_webhook_test.go diff --git a/ray-operator/apis/ray/v1/raycluster_webhook.go b/ray-operator/apis/ray/v1/raycluster_webhook.go index 6650ef9534..1f9ac2f902 100644 --- a/ray-operator/apis/ray/v1/raycluster_webhook.go +++ b/ray-operator/apis/ray/v1/raycluster_webhook.go @@ -19,6 +19,8 @@ var ( nameRegex, _ = regexp.Compile("^[a-z]([-a-z0-9]*[a-z0-9])?$") ) +const RayFTEnabledAnnotationKey = "ray.io/ft-enabled" + func (r *RayCluster) SetupWebhookWithManager(mgr ctrl.Manager) error { return ctrl.NewWebhookManagedBy(mgr). For(r). @@ -59,6 +61,10 @@ func (r *RayCluster) validateRayCluster() error { allErrs = append(allErrs, err) } + if err := r.ValidateRayClusterSpec(); err != nil { + allErrs = append(allErrs, err) + } + if len(allErrs) == 0 { return nil } @@ -87,3 +93,17 @@ func (r *RayCluster) validateWorkerGroups() *field.Error { return nil } + +func (r *RayCluster) ValidateRayClusterSpec() *field.Error { + if r.Annotations[RayFTEnabledAnnotationKey] == "false" && r.Spec.GcsFaultToleranceOptions != nil { + return field.Invalid(field.NewPath("spec").Child("gcsFaultToleranceOptions"), r.Spec.GcsFaultToleranceOptions, "GcsFaultToleranceOptions should be nil when ray.io/ft-enabled is disabled") + } + if r.Annotations[RayFTEnabledAnnotationKey] != "true" && r.Spec.HeadGroupSpec.Template.Spec.Containers[0].Env != nil { + for _, env := range r.Spec.HeadGroupSpec.Template.Spec.Containers[0].Env { + if env.Name == "RAY_REDIS_ADDRESS" { + return field.Invalid(field.NewPath("spec").Child("headGroupSpec").Child("template").Child("spec").Child("containers").Index(0).Child("env"), env.Name, "RAY_REDIS_ADDRESS should not be set when ray.io/ft-enabled is disabled") + } + } + } + return nil +} diff --git a/ray-operator/apis/ray/v1/raycluster_webhook_test.go b/ray-operator/apis/ray/v1/raycluster_webhook_test.go new file mode 100644 index 0000000000..a4e1f9dc99 --- /dev/null +++ b/ray-operator/apis/ray/v1/raycluster_webhook_test.go @@ -0,0 +1,251 @@ +package v1 + +import ( + "testing" + + "github.com/stretchr/testify/assert" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/validation/field" +) + +func TestValidateRayClusterSpec(t *testing.T) { + tests := []struct { + gcsFaultToleranceOptions *GcsFaultToleranceOptions + name string + annotations map[string]string + envVars []corev1.EnvVar + errorMessage string + expectError bool + }{ + { + name: "FT disabled with GcsFaultToleranceOptions set", + annotations: map[string]string{ + RayFTEnabledAnnotationKey: "false", + }, + gcsFaultToleranceOptions: &GcsFaultToleranceOptions{}, + expectError: true, + errorMessage: "GcsFaultToleranceOptions should be nil when ray.io/ft-enabled is disabled", + }, + { + name: "FT disabled with RAY_REDIS_ADDRESS set", + annotations: map[string]string{ + RayFTEnabledAnnotationKey: "false", + }, + envVars: []corev1.EnvVar{ + { + Name: "RAY_REDIS_ADDRESS", + Value: "redis://127.0.0.1:6379", + }, + }, + expectError: true, + errorMessage: "RAY_REDIS_ADDRESS should not be set when ray.io/ft-enabled is disabled", + }, + { + name: "FT not set with RAY_REDIS_ADDRESS set", + annotations: map[string]string{}, + envVars: []corev1.EnvVar{ + { + Name: "RAY_REDIS_ADDRESS", + Value: "redis://127.0.0.1:6379", + }, + }, + expectError: true, + errorMessage: "RAY_REDIS_ADDRESS should not be set when ray.io/ft-enabled is disabled", + }, + { + name: "FT disabled with other environment variables set", + annotations: map[string]string{ + RayFTEnabledAnnotationKey: "false", + }, + envVars: []corev1.EnvVar{ + { + Name: "SOME_OTHER_ENV", + Value: "some-value", + }, + }, + expectError: false, + }, + { + name: "FT enabled, GcsFaultToleranceOptions not nil", + annotations: map[string]string{ + RayFTEnabledAnnotationKey: "true", + }, + gcsFaultToleranceOptions: &GcsFaultToleranceOptions{ + RedisAddress: "redis://127.0.0.1:6379", + }, + expectError: false, + }, + { + name: "FT enabled, GcsFaultToleranceOptions is nil", + annotations: map[string]string{ + RayFTEnabledAnnotationKey: "true", + }, + expectError: false, + }, + { + name: "FT enabled with with other environment variables set", + annotations: map[string]string{ + RayFTEnabledAnnotationKey: "true", + }, + envVars: []corev1.EnvVar{ + { + Name: "SOME_OTHER_ENV", + Value: "some-value", + }, + }, + expectError: false, + }, + { + name: "FT enabled with RAY_REDIS_ADDRESS set", + annotations: map[string]string{ + RayFTEnabledAnnotationKey: "true", + }, + envVars: []corev1.EnvVar{ + { + Name: "RAY_REDIS_ADDRESS", + Value: "redis://127.0.0.1:6379", + }, + }, + expectError: false, + }, + { + name: "FT disabled with no GcsFaultToleranceOptions and no RAY_REDIS_ADDRESS", + annotations: map[string]string{ + RayFTEnabledAnnotationKey: "false", + }, + expectError: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + r := &RayCluster{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: tt.annotations, + }, + Spec: RayClusterSpec{ + GcsFaultToleranceOptions: tt.gcsFaultToleranceOptions, + HeadGroupSpec: HeadGroupSpec{ + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "ray-head", + Env: tt.envVars, + }, + }, + }, + }, + }, + }, + } + err := r.ValidateRayClusterSpec() + if tt.expectError { + assert.NotNil(t, err) + assert.IsType(t, &field.Error{}, err) + assert.Equal(t, err.Detail, tt.errorMessage) + } else { + assert.Nil(t, err) + } + }) + } +} +func TestValidateRayCluster(t *testing.T) { + tests := []struct { + GcsFaultToleranceOptions *GcsFaultToleranceOptions + name string + ObjectMeta metav1.ObjectMeta + WorkerGroupSpecs []WorkerGroupSpec + expectError bool + errorMessage string + }{ + { + name: "Invalid name", + ObjectMeta: metav1.ObjectMeta{ + Name: "Invalid_Name", + }, + expectError: true, + errorMessage: "name must consist of lower case alphanumeric characters or '-', start with an alphabetic character, and end with an alphanumeric character", + }, + { + name: "Duplicate worker group names", + + ObjectMeta: metav1.ObjectMeta{ + Name: "valid-name", + }, + + WorkerGroupSpecs: []WorkerGroupSpec{ + {GroupName: "group1"}, + {GroupName: "group1"}, + }, + + expectError: true, + errorMessage: "worker group names must be unique", + }, + { + name: "FT disabled with GcsFaultToleranceOptions set", + + ObjectMeta: metav1.ObjectMeta{ + Name: "valid-name", + Annotations: map[string]string{ + RayFTEnabledAnnotationKey: "false", + }, + }, + GcsFaultToleranceOptions: &GcsFaultToleranceOptions{}, + expectError: true, + errorMessage: "GcsFaultToleranceOptions should be nil when ray.io/ft-enabled is disabled", + }, + { + name: "Valid RayCluster", + + ObjectMeta: metav1.ObjectMeta{ + Name: "valid-name", + Annotations: map[string]string{ + RayFTEnabledAnnotationKey: "true", + }, + }, + GcsFaultToleranceOptions: &GcsFaultToleranceOptions{ + RedisAddress: "redis://127.0.0.1:6379", + }, + WorkerGroupSpecs: []WorkerGroupSpec{ + {GroupName: "group1"}, + {GroupName: "group2"}, + }, + expectError: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + rayCluster := &RayCluster{ + ObjectMeta: tt.ObjectMeta, + Spec: RayClusterSpec{ + GcsFaultToleranceOptions: tt.GcsFaultToleranceOptions, + HeadGroupSpec: HeadGroupSpec{ + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "ray-head", + }, + }, + }, + }, + }, + WorkerGroupSpecs: tt.WorkerGroupSpecs, + }, + } + err := rayCluster.validateRayCluster() + if tt.expectError { + assert.NotNil(t, err) + assert.IsType(t, &apierrors.StatusError{}, err) + statusErr := err.(*apierrors.StatusError) + assert.Contains(t, statusErr.ErrStatus.Details.Causes[0].Message, tt.errorMessage) + } else { + assert.Nil(t, err) + } + }) + } +} From f78dd697664709e1bc29e673934cecdbe7c03ff4 Mon Sep 17 00:00:00 2001 From: Cheyu Wu Date: Mon, 13 Jan 2025 02:58:04 +0800 Subject: [PATCH 02/13] fix: linter issue --- ray-operator/apis/ray/v1/raycluster_webhook_test.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ray-operator/apis/ray/v1/raycluster_webhook_test.go b/ray-operator/apis/ray/v1/raycluster_webhook_test.go index a4e1f9dc99..c30f62dba9 100644 --- a/ray-operator/apis/ray/v1/raycluster_webhook_test.go +++ b/ray-operator/apis/ray/v1/raycluster_webhook_test.go @@ -13,10 +13,10 @@ import ( func TestValidateRayClusterSpec(t *testing.T) { tests := []struct { gcsFaultToleranceOptions *GcsFaultToleranceOptions - name string annotations map[string]string - envVars []corev1.EnvVar + name string errorMessage string + envVars []corev1.EnvVar expectError bool }{ { @@ -152,14 +152,15 @@ func TestValidateRayClusterSpec(t *testing.T) { }) } } + func TestValidateRayCluster(t *testing.T) { tests := []struct { GcsFaultToleranceOptions *GcsFaultToleranceOptions name string + errorMessage string ObjectMeta metav1.ObjectMeta WorkerGroupSpecs []WorkerGroupSpec expectError bool - errorMessage string }{ { name: "Invalid name", @@ -241,8 +242,7 @@ func TestValidateRayCluster(t *testing.T) { if tt.expectError { assert.NotNil(t, err) assert.IsType(t, &apierrors.StatusError{}, err) - statusErr := err.(*apierrors.StatusError) - assert.Contains(t, statusErr.ErrStatus.Details.Causes[0].Message, tt.errorMessage) + assert.Contains(t, err.Error(), tt.errorMessage) } else { assert.Nil(t, err) } From 20f47ec958db5e73cd15bffe637296827fb579a0 Mon Sep 17 00:00:00 2001 From: Cheyu Wu Date: Mon, 13 Jan 2025 22:25:18 +0800 Subject: [PATCH 03/13] feat: add ValidateRayClusterSpec in raycluster webhook, copy necessary func and variables to /ray/v1 --- ray-operator/apis/ray/v1/constant.go | 10 ++++++++ .../apis/ray/v1/raycluster_webhook.go | 24 ++++++++++++------- .../apis/ray/v1/raycluster_webhook_test.go | 15 ++++++------ ray-operator/apis/ray/v1/utils.go | 12 ++++++++++ 4 files changed, 46 insertions(+), 15 deletions(-) create mode 100644 ray-operator/apis/ray/v1/constant.go create mode 100644 ray-operator/apis/ray/v1/utils.go diff --git a/ray-operator/apis/ray/v1/constant.go b/ray-operator/apis/ray/v1/constant.go new file mode 100644 index 0000000000..fedf81a91f --- /dev/null +++ b/ray-operator/apis/ray/v1/constant.go @@ -0,0 +1,10 @@ +package v1 + +// In KubeRay, the Ray container must be the first application container in a head or worker Pod. +const RayContainerIndex = 0 + +// Use as container env variable +const RAY_REDIS_ADDRESS = "RAY_REDIS_ADDRESS" + +// Ray GCS FT related annotations +const RayFTEnabledAnnotationKey = "ray.io/ft-enabled" diff --git a/ray-operator/apis/ray/v1/raycluster_webhook.go b/ray-operator/apis/ray/v1/raycluster_webhook.go index 1f9ac2f902..37c663136a 100644 --- a/ray-operator/apis/ray/v1/raycluster_webhook.go +++ b/ray-operator/apis/ray/v1/raycluster_webhook.go @@ -1,6 +1,7 @@ package v1 import ( + "fmt" "regexp" apierrors "k8s.io/apimachinery/pkg/api/errors" @@ -19,8 +20,6 @@ var ( nameRegex, _ = regexp.Compile("^[a-z]([-a-z0-9]*[a-z0-9])?$") ) -const RayFTEnabledAnnotationKey = "ray.io/ft-enabled" - func (r *RayCluster) SetupWebhookWithManager(mgr ctrl.Manager) error { return ctrl.NewWebhookManagedBy(mgr). For(r). @@ -96,13 +95,22 @@ func (r *RayCluster) validateWorkerGroups() *field.Error { func (r *RayCluster) ValidateRayClusterSpec() *field.Error { if r.Annotations[RayFTEnabledAnnotationKey] == "false" && r.Spec.GcsFaultToleranceOptions != nil { - return field.Invalid(field.NewPath("spec").Child("gcsFaultToleranceOptions"), r.Spec.GcsFaultToleranceOptions, "GcsFaultToleranceOptions should be nil when ray.io/ft-enabled is disabled") + return field.Invalid( + field.NewPath("spec").Child("gcsFaultToleranceOptions"), + r.Spec.GcsFaultToleranceOptions, + fmt.Sprintf("GcsFaultToleranceOptions should be nil when %s is disabled", RayFTEnabledAnnotationKey), + ) } - if r.Annotations[RayFTEnabledAnnotationKey] != "true" && r.Spec.HeadGroupSpec.Template.Spec.Containers[0].Env != nil { - for _, env := range r.Spec.HeadGroupSpec.Template.Spec.Containers[0].Env { - if env.Name == "RAY_REDIS_ADDRESS" { - return field.Invalid(field.NewPath("spec").Child("headGroupSpec").Child("template").Child("spec").Child("containers").Index(0).Child("env"), env.Name, "RAY_REDIS_ADDRESS should not be set when ray.io/ft-enabled is disabled") - } + if r.Annotations[RayFTEnabledAnnotationKey] != "true" && + len(r.Spec.HeadGroupSpec.Template.Spec.Containers) > 0 && + r.Spec.HeadGroupSpec.Template.Spec.Containers[RayContainerIndex].Env != nil { + + if EnvVarExists(RAY_REDIS_ADDRESS, r.Spec.HeadGroupSpec.Template.Spec.Containers[RayContainerIndex].Env) { + return field.Invalid( + field.NewPath("spec").Child("headGroupSpec").Child("template").Child("spec").Child("containers").Index(0).Child("env"), + RAY_REDIS_ADDRESS, + fmt.Sprintf("%s should not be set when %s is disabled", RAY_REDIS_ADDRESS, RayFTEnabledAnnotationKey), + ) } } return nil diff --git a/ray-operator/apis/ray/v1/raycluster_webhook_test.go b/ray-operator/apis/ray/v1/raycluster_webhook_test.go index c30f62dba9..f6150bc916 100644 --- a/ray-operator/apis/ray/v1/raycluster_webhook_test.go +++ b/ray-operator/apis/ray/v1/raycluster_webhook_test.go @@ -1,6 +1,7 @@ package v1 import ( + "fmt" "testing" "github.com/stretchr/testify/assert" @@ -26,7 +27,7 @@ func TestValidateRayClusterSpec(t *testing.T) { }, gcsFaultToleranceOptions: &GcsFaultToleranceOptions{}, expectError: true, - errorMessage: "GcsFaultToleranceOptions should be nil when ray.io/ft-enabled is disabled", + errorMessage: fmt.Sprintf("GcsFaultToleranceOptions should be nil when %s is disabled", RayFTEnabledAnnotationKey), }, { name: "FT disabled with RAY_REDIS_ADDRESS set", @@ -35,24 +36,24 @@ func TestValidateRayClusterSpec(t *testing.T) { }, envVars: []corev1.EnvVar{ { - Name: "RAY_REDIS_ADDRESS", + Name: RAY_REDIS_ADDRESS, Value: "redis://127.0.0.1:6379", }, }, expectError: true, - errorMessage: "RAY_REDIS_ADDRESS should not be set when ray.io/ft-enabled is disabled", + errorMessage: fmt.Sprintf("%s should not be set when %s is disabled", RAY_REDIS_ADDRESS, RayFTEnabledAnnotationKey), }, { name: "FT not set with RAY_REDIS_ADDRESS set", annotations: map[string]string{}, envVars: []corev1.EnvVar{ { - Name: "RAY_REDIS_ADDRESS", + Name: RAY_REDIS_ADDRESS, Value: "redis://127.0.0.1:6379", }, }, expectError: true, - errorMessage: "RAY_REDIS_ADDRESS should not be set when ray.io/ft-enabled is disabled", + errorMessage: fmt.Sprintf("%s should not be set when %s is disabled", RAY_REDIS_ADDRESS, RayFTEnabledAnnotationKey), }, { name: "FT disabled with other environment variables set", @@ -104,7 +105,7 @@ func TestValidateRayClusterSpec(t *testing.T) { }, envVars: []corev1.EnvVar{ { - Name: "RAY_REDIS_ADDRESS", + Name: RAY_REDIS_ADDRESS, Value: "redis://127.0.0.1:6379", }, }, @@ -196,7 +197,7 @@ func TestValidateRayCluster(t *testing.T) { }, GcsFaultToleranceOptions: &GcsFaultToleranceOptions{}, expectError: true, - errorMessage: "GcsFaultToleranceOptions should be nil when ray.io/ft-enabled is disabled", + errorMessage: fmt.Sprintf("GcsFaultToleranceOptions should be nil when %s is disabled", RayFTEnabledAnnotationKey), }, { name: "Valid RayCluster", diff --git a/ray-operator/apis/ray/v1/utils.go b/ray-operator/apis/ray/v1/utils.go new file mode 100644 index 0000000000..59dff80124 --- /dev/null +++ b/ray-operator/apis/ray/v1/utils.go @@ -0,0 +1,12 @@ +package v1 + +import corev1 "k8s.io/api/core/v1" + +func EnvVarExists(envName string, envVars []corev1.EnvVar) bool { + for _, env := range envVars { + if env.Name == envName { + return true + } + } + return false +} From f742941703308a2ec795b8f98011d2384da12541 Mon Sep 17 00:00:00 2001 From: Cheyu Wu Date: Tue, 14 Jan 2025 00:51:16 +0800 Subject: [PATCH 04/13] fix: reference duplicate func and constant variables --- ray-operator/apis/ray/v1/constant.go | 14 ++++++++------ ray-operator/controllers/ray/utils/constant.go | 12 ++++++++---- ray-operator/controllers/ray/utils/util.go | 9 +-------- 3 files changed, 17 insertions(+), 18 deletions(-) diff --git a/ray-operator/apis/ray/v1/constant.go b/ray-operator/apis/ray/v1/constant.go index fedf81a91f..66982cabac 100644 --- a/ray-operator/apis/ray/v1/constant.go +++ b/ray-operator/apis/ray/v1/constant.go @@ -1,10 +1,12 @@ package v1 -// In KubeRay, the Ray container must be the first application container in a head or worker Pod. -const RayContainerIndex = 0 +const ( + // In KubeRay, the Ray container must be the first application container in a head or worker Pod. + RayContainerIndex = 0 -// Use as container env variable -const RAY_REDIS_ADDRESS = "RAY_REDIS_ADDRESS" + // Use as container env variable + RAY_REDIS_ADDRESS = "RAY_REDIS_ADDRESS" -// Ray GCS FT related annotations -const RayFTEnabledAnnotationKey = "ray.io/ft-enabled" + // Ray GCS FT related annotations + RayFTEnabledAnnotationKey = "ray.io/ft-enabled" +) diff --git a/ray-operator/controllers/ray/utils/constant.go b/ray-operator/controllers/ray/utils/constant.go index e9c97a6e36..a9cf23302e 100644 --- a/ray-operator/controllers/ray/utils/constant.go +++ b/ray-operator/controllers/ray/utils/constant.go @@ -1,6 +1,10 @@ package utils -import "errors" +import ( + "errors" + + rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" +) const ( @@ -28,7 +32,7 @@ const ( KubeRayVersion = "ray.io/kuberay-version" // In KubeRay, the Ray container must be the first application container in a head or worker Pod. - RayContainerIndex = 0 + RayContainerIndex = rayv1.RayContainerIndex // Batch scheduling labels // TODO(tgaddair): consider making these part of the CRD @@ -37,7 +41,7 @@ const ( RayClusterGangSchedulingEnabled = "ray.io/gang-scheduling-enabled" // Ray GCS FT related annotations - RayFTEnabledAnnotationKey = "ray.io/ft-enabled" + RayFTEnabledAnnotationKey = rayv1.RayFTEnabledAnnotationKey RayExternalStorageNSAnnotationKey = "ray.io/external-storage-namespace" // If this annotation is set to "true", the KubeRay operator will not modify the container's command. @@ -95,7 +99,7 @@ const ( FQ_RAY_IP = "FQ_RAY_IP" RAY_PORT = "RAY_PORT" RAY_ADDRESS = "RAY_ADDRESS" - RAY_REDIS_ADDRESS = "RAY_REDIS_ADDRESS" + RAY_REDIS_ADDRESS = rayv1.RAY_REDIS_ADDRESS REDIS_PASSWORD = "REDIS_PASSWORD" RAY_DASHBOARD_ENABLE_K8S_DISK_USAGE = "RAY_DASHBOARD_ENABLE_K8S_DISK_USAGE" RAY_EXTERNAL_STORAGE_NS = "RAY_external_storage_namespace" diff --git a/ray-operator/controllers/ray/utils/util.go b/ray-operator/controllers/ray/utils/util.go index 7ea1ba8185..10179cf45d 100644 --- a/ray-operator/controllers/ray/utils/util.go +++ b/ray-operator/controllers/ray/utils/util.go @@ -589,14 +589,7 @@ func IsJobFinished(j *batchv1.Job) (batchv1.JobConditionType, bool) { return "", false } -func EnvVarExists(envName string, envVars []corev1.EnvVar) bool { - for _, env := range envVars { - if env.Name == envName { - return true - } - } - return false -} +var EnvVarExists func(envName string, envVars []corev1.EnvVar) bool = rayv1.EnvVarExists func UpsertEnvVar(envVars []corev1.EnvVar, newEnvVar corev1.EnvVar) []corev1.EnvVar { overridden := false From ff87926773bd335854a3708a128525bfead98482 Mon Sep 17 00:00:00 2001 From: Cheyu Wu Date: Tue, 14 Jan 2025 23:37:49 +0800 Subject: [PATCH 05/13] fix: rm Containers[RayContainerIndex].Env check which is not necessary and change confusing message --- ray-operator/apis/ray/v1/raycluster_webhook.go | 7 ++----- ray-operator/apis/ray/v1/raycluster_webhook_test.go | 4 ++-- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/ray-operator/apis/ray/v1/raycluster_webhook.go b/ray-operator/apis/ray/v1/raycluster_webhook.go index 37c663136a..3c4086e955 100644 --- a/ray-operator/apis/ray/v1/raycluster_webhook.go +++ b/ray-operator/apis/ray/v1/raycluster_webhook.go @@ -98,13 +98,10 @@ func (r *RayCluster) ValidateRayClusterSpec() *field.Error { return field.Invalid( field.NewPath("spec").Child("gcsFaultToleranceOptions"), r.Spec.GcsFaultToleranceOptions, - fmt.Sprintf("GcsFaultToleranceOptions should be nil when %s is disabled", RayFTEnabledAnnotationKey), + fmt.Sprintf("GcsFaultToleranceOptions should be nil when %s annotation is set to false", RayFTEnabledAnnotationKey), ) } - if r.Annotations[RayFTEnabledAnnotationKey] != "true" && - len(r.Spec.HeadGroupSpec.Template.Spec.Containers) > 0 && - r.Spec.HeadGroupSpec.Template.Spec.Containers[RayContainerIndex].Env != nil { - + if r.Annotations[RayFTEnabledAnnotationKey] != "true" && len(r.Spec.HeadGroupSpec.Template.Spec.Containers) > 0 { if EnvVarExists(RAY_REDIS_ADDRESS, r.Spec.HeadGroupSpec.Template.Spec.Containers[RayContainerIndex].Env) { return field.Invalid( field.NewPath("spec").Child("headGroupSpec").Child("template").Child("spec").Child("containers").Index(0).Child("env"), diff --git a/ray-operator/apis/ray/v1/raycluster_webhook_test.go b/ray-operator/apis/ray/v1/raycluster_webhook_test.go index f6150bc916..a112e2cf1f 100644 --- a/ray-operator/apis/ray/v1/raycluster_webhook_test.go +++ b/ray-operator/apis/ray/v1/raycluster_webhook_test.go @@ -27,7 +27,7 @@ func TestValidateRayClusterSpec(t *testing.T) { }, gcsFaultToleranceOptions: &GcsFaultToleranceOptions{}, expectError: true, - errorMessage: fmt.Sprintf("GcsFaultToleranceOptions should be nil when %s is disabled", RayFTEnabledAnnotationKey), + errorMessage: fmt.Sprintf("GcsFaultToleranceOptions should be nil when %s annotation is set to false", RayFTEnabledAnnotationKey), }, { name: "FT disabled with RAY_REDIS_ADDRESS set", @@ -197,7 +197,7 @@ func TestValidateRayCluster(t *testing.T) { }, GcsFaultToleranceOptions: &GcsFaultToleranceOptions{}, expectError: true, - errorMessage: fmt.Sprintf("GcsFaultToleranceOptions should be nil when %s is disabled", RayFTEnabledAnnotationKey), + errorMessage: fmt.Sprintf("GcsFaultToleranceOptions should be nil when %s annotation is set to false", RayFTEnabledAnnotationKey), }, { name: "Valid RayCluster", From f686791a6a708c69e87c9893d16a4f0555160459 Mon Sep 17 00:00:00 2001 From: Cheyu Wu Date: Wed, 15 Jan 2025 23:48:11 +0800 Subject: [PATCH 06/13] refactor: mv IsGCSFaultToleranceEnabled to rayv1 and reference to origin one --- ray-operator/apis/ray/v1/pod.go | 8 +++ ray-operator/apis/ray/v1/pod_test.go | 70 ++++++++++++++++++++++ ray-operator/controllers/ray/common/pod.go | 5 +- 3 files changed, 79 insertions(+), 4 deletions(-) create mode 100644 ray-operator/apis/ray/v1/pod.go create mode 100644 ray-operator/apis/ray/v1/pod_test.go diff --git a/ray-operator/apis/ray/v1/pod.go b/ray-operator/apis/ray/v1/pod.go new file mode 100644 index 0000000000..c7ce4289c6 --- /dev/null +++ b/ray-operator/apis/ray/v1/pod.go @@ -0,0 +1,8 @@ +package v1 + +import "strings" + +func IsGCSFaultToleranceEnabled(instance RayCluster) bool { + v, ok := instance.Annotations[RayFTEnabledAnnotationKey] + return (ok && strings.ToLower(v) == "true") || instance.Spec.GcsFaultToleranceOptions != nil +} diff --git a/ray-operator/apis/ray/v1/pod_test.go b/ray-operator/apis/ray/v1/pod_test.go new file mode 100644 index 0000000000..68bf7f1786 --- /dev/null +++ b/ray-operator/apis/ray/v1/pod_test.go @@ -0,0 +1,70 @@ +package v1 + +import ( + "testing" + + "github.com/stretchr/testify/assert" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func TestIsGCSFaultToleranceEnabled(t *testing.T) { + tests := []struct { + name string + instance RayCluster + expected bool + }{ + { + name: "ray.io/ft-enabled is true", + instance: RayCluster{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + RayFTEnabledAnnotationKey: "true", + }, + }, + }, + expected: true, + }, + { + name: "ray.io/ft-enabled is false", + instance: RayCluster{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + RayFTEnabledAnnotationKey: "false", + }, + }, + }, + expected: false, + }, + { + name: "ray.io/ft-enabled is nil, GcsFaultToleranceOptions is not nil", + instance: RayCluster{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{}, + }, + Spec: RayClusterSpec{ + GcsFaultToleranceOptions: &GcsFaultToleranceOptions{}, + }, + }, + expected: true, + }, + { + name: "ray.io/ft-enabled is nil, GcsFaultToleranceOptions is nil", + instance: RayCluster{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{}, + }, + Spec: RayClusterSpec{ + GcsFaultToleranceOptions: nil, + }, + }, + expected: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := IsGCSFaultToleranceEnabled(tt.instance) + assert.Equal(t, tt.expected, result) + }) + } +} diff --git a/ray-operator/controllers/ray/common/pod.go b/ray-operator/controllers/ray/common/pod.go index facf340211..94b4d3baf8 100644 --- a/ray-operator/controllers/ray/common/pod.go +++ b/ray-operator/controllers/ray/common/pod.go @@ -56,10 +56,7 @@ func GetHeadPort(headStartParams map[string]string) string { } // Check if the RayCluster has GCS fault tolerance enabled. -func IsGCSFaultToleranceEnabled(instance rayv1.RayCluster) bool { - v, ok := instance.Annotations[utils.RayFTEnabledAnnotationKey] - return (ok && strings.ToLower(v) == "true") || instance.Spec.GcsFaultToleranceOptions != nil -} +var IsGCSFaultToleranceEnabled func(instance rayv1.RayCluster) bool = rayv1.IsGCSFaultToleranceEnabled // Check if overwrites the container command. func isOverwriteRayContainerCmd(instance rayv1.RayCluster) bool { From da307bbab98c200f11e1bf22ecbadefc7127ac0e Mon Sep 17 00:00:00 2001 From: Cheyu Wu Date: Wed, 15 Jan 2025 23:50:00 +0800 Subject: [PATCH 07/13] refactor: mv utils constant REDIS_PASSWORD to rayv1 constant --- ray-operator/apis/ray/v1/constant.go | 2 +- ray-operator/controllers/ray/utils/constant.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ray-operator/apis/ray/v1/constant.go b/ray-operator/apis/ray/v1/constant.go index 66982cabac..d8c25825d6 100644 --- a/ray-operator/apis/ray/v1/constant.go +++ b/ray-operator/apis/ray/v1/constant.go @@ -6,7 +6,7 @@ const ( // Use as container env variable RAY_REDIS_ADDRESS = "RAY_REDIS_ADDRESS" - + REDIS_PASSWORD = "REDIS_PASSWORD" // Ray GCS FT related annotations RayFTEnabledAnnotationKey = "ray.io/ft-enabled" ) diff --git a/ray-operator/controllers/ray/utils/constant.go b/ray-operator/controllers/ray/utils/constant.go index a9cf23302e..3fe775fce0 100644 --- a/ray-operator/controllers/ray/utils/constant.go +++ b/ray-operator/controllers/ray/utils/constant.go @@ -100,7 +100,7 @@ const ( RAY_PORT = "RAY_PORT" RAY_ADDRESS = "RAY_ADDRESS" RAY_REDIS_ADDRESS = rayv1.RAY_REDIS_ADDRESS - REDIS_PASSWORD = "REDIS_PASSWORD" + REDIS_PASSWORD = rayv1.REDIS_PASSWORD RAY_DASHBOARD_ENABLE_K8S_DISK_USAGE = "RAY_DASHBOARD_ENABLE_K8S_DISK_USAGE" RAY_EXTERNAL_STORAGE_NS = "RAY_external_storage_namespace" RAY_GCS_RPC_SERVER_RECONNECT_TIMEOUT_S = "RAY_gcs_rpc_server_reconnect_timeout_s" From 72c0e91c03bc5c44180b4cc94062230a84615791 Mon Sep 17 00:00:00 2001 From: Cheyu Wu Date: Thu, 16 Jan 2025 00:47:49 +0800 Subject: [PATCH 08/13] fix: correct the test name in raycluster_controller_unit_test.go, change the logic which same as ValidateRayClusterSpec in raycluster_controller_unit_test.go and add additional unit test --- .../apis/ray/v1/raycluster_webhook.go | 61 ++- .../apis/ray/v1/raycluster_webhook_test.go | 377 ++++++++++++------ .../ray/raycluster_controller_unit_test.go | 2 +- 3 files changed, 317 insertions(+), 123 deletions(-) diff --git a/ray-operator/apis/ray/v1/raycluster_webhook.go b/ray-operator/apis/ray/v1/raycluster_webhook.go index 3c4086e955..91477ad13b 100644 --- a/ray-operator/apis/ray/v1/raycluster_webhook.go +++ b/ray-operator/apis/ray/v1/raycluster_webhook.go @@ -94,21 +94,66 @@ func (r *RayCluster) validateWorkerGroups() *field.Error { } func (r *RayCluster) ValidateRayClusterSpec() *field.Error { - if r.Annotations[RayFTEnabledAnnotationKey] == "false" && r.Spec.GcsFaultToleranceOptions != nil { + if len(r.Spec.HeadGroupSpec.Template.Spec.Containers) == 0 { return field.Invalid( - field.NewPath("spec").Child("gcsFaultToleranceOptions"), - r.Spec.GcsFaultToleranceOptions, - fmt.Sprintf("GcsFaultToleranceOptions should be nil when %s annotation is set to false", RayFTEnabledAnnotationKey), + field.NewPath("spec").Child("headGroupSpec").Child("template").Child("spec").Child("containers"), + r.Spec.HeadGroupSpec.Template.Spec.Containers, + "headGroupSpec should have at least one container", ) } - if r.Annotations[RayFTEnabledAnnotationKey] != "true" && len(r.Spec.HeadGroupSpec.Template.Spec.Containers) > 0 { + + for _, workerGroup := range r.Spec.WorkerGroupSpecs { + if len(workerGroup.Template.Spec.Containers) == 0 { + return field.Invalid( + field.NewPath("spec").Child("workerGroupSpecs"), + r.Spec.WorkerGroupSpecs, + "workerGroupSpec should have at least one container", + ) + } + } + + if r.Annotations[RayFTEnabledAnnotationKey] != "" && r.Spec.GcsFaultToleranceOptions != nil { + return field.Invalid( + field.NewPath("metadata").Child("annotations").Child(RayFTEnabledAnnotationKey), + r.Annotations[RayFTEnabledAnnotationKey], + fmt.Sprintf("%s annotation and GcsFaultToleranceOptions are both set. "+ + "Please use only GcsFaultToleranceOptions to configure GCS fault tolerance", RayFTEnabledAnnotationKey), + ) + } + + if !IsGCSFaultToleranceEnabled(*r) { if EnvVarExists(RAY_REDIS_ADDRESS, r.Spec.HeadGroupSpec.Template.Spec.Containers[RayContainerIndex].Env) { return field.Invalid( - field.NewPath("spec").Child("headGroupSpec").Child("template").Child("spec").Child("containers").Index(0).Child("env"), - RAY_REDIS_ADDRESS, - fmt.Sprintf("%s should not be set when %s is disabled", RAY_REDIS_ADDRESS, RayFTEnabledAnnotationKey), + field.NewPath("spec").Child("headGroupSpec").Child("template").Child("spec").Child("containers").Index(RayContainerIndex).Child("env"), + r.Spec.HeadGroupSpec.Template.Spec.Containers[RayContainerIndex].Env, + fmt.Sprintf("%s is set which implicitly enables GCS fault tolerance, "+ + "but GcsFaultToleranceOptions is not set. Please set GcsFaultToleranceOptions "+ + "to enable GCS fault tolerance", RAY_REDIS_ADDRESS), ) } } + + if r.Spec.GcsFaultToleranceOptions != nil { + if redisPassword := r.Spec.HeadGroupSpec.RayStartParams["redis-password"]; redisPassword != "" { + return field.Invalid( + field.NewPath("spec").Child("headGroupSpec").Child("rayStartParams"), + r.Spec.HeadGroupSpec.RayStartParams, + "cannot set `redis-password` in rayStartParams when GcsFaultToleranceOptions is enabled - use GcsFaultToleranceOptions.RedisPassword instead", + ) + } + + headContainer := r.Spec.HeadGroupSpec.Template.Spec.Containers[RayContainerIndex] + if EnvVarExists(REDIS_PASSWORD, headContainer.Env) { + return field.Invalid( + field.NewPath("spec").Child("headGroupSpec").Child("template").Child("spec").Child("containers").Index(RayContainerIndex).Child("env"), + headContainer.Env, + "cannot set `REDIS_PASSWORD` env var in head Pod when GcsFaultToleranceOptions is enabled - use GcsFaultToleranceOptions.RedisPassword instead", + ) + } + } + + // TODO (kevin85421): If GcsFaultToleranceOptions is set, users should use `GcsFaultToleranceOptions.RedisAddress` instead of `RAY_REDIS_ADDRESS`. + // TODO (kevin85421): If GcsFaultToleranceOptions is set, users should use `GcsFaultToleranceOptions.ExternalStorageNamespace` instead of + // the annotation `ray.io/external-storage-namespace`. return nil } diff --git a/ray-operator/apis/ray/v1/raycluster_webhook_test.go b/ray-operator/apis/ray/v1/raycluster_webhook_test.go index a112e2cf1f..5a18da5752 100644 --- a/ray-operator/apis/ray/v1/raycluster_webhook_test.go +++ b/ray-operator/apis/ray/v1/raycluster_webhook_test.go @@ -11,7 +11,13 @@ import ( "k8s.io/apimachinery/pkg/util/validation/field" ) -func TestValidateRayClusterSpec(t *testing.T) { +func TestValidateRayClusterSpecGcsFaultToleranceOptions(t *testing.T) { + errorMessageBothSet := fmt.Sprintf("%s annotation and GcsFaultToleranceOptions are both set. "+ + "Please use only GcsFaultToleranceOptions to configure GCS fault tolerance", RayFTEnabledAnnotationKey) + errorMessageRedisAddressSet := fmt.Sprintf("%s is set which implicitly enables GCS fault tolerance, "+ + "but GcsFaultToleranceOptions is not set. Please set GcsFaultToleranceOptions "+ + "to enable GCS fault tolerance", RAY_REDIS_ADDRESS) + tests := []struct { gcsFaultToleranceOptions *GcsFaultToleranceOptions annotations map[string]string @@ -20,109 +26,79 @@ func TestValidateRayClusterSpec(t *testing.T) { envVars []corev1.EnvVar expectError bool }{ + // GcsFaultToleranceOptions and ray.io/ft-enabled should not be both set. { - name: "FT disabled with GcsFaultToleranceOptions set", + name: "ray.io/ft-enabled is set to false and GcsFaultToleranceOptions is set", annotations: map[string]string{ RayFTEnabledAnnotationKey: "false", }, gcsFaultToleranceOptions: &GcsFaultToleranceOptions{}, expectError: true, - errorMessage: fmt.Sprintf("GcsFaultToleranceOptions should be nil when %s annotation is set to false", RayFTEnabledAnnotationKey), + errorMessage: errorMessageBothSet, }, { - name: "FT disabled with RAY_REDIS_ADDRESS set", + name: "ray.io/ft-enabled is set to true and GcsFaultToleranceOptions is set", annotations: map[string]string{ - RayFTEnabledAnnotationKey: "false", - }, - envVars: []corev1.EnvVar{ - { - Name: RAY_REDIS_ADDRESS, - Value: "redis://127.0.0.1:6379", - }, + RayFTEnabledAnnotationKey: "true", }, - expectError: true, - errorMessage: fmt.Sprintf("%s should not be set when %s is disabled", RAY_REDIS_ADDRESS, RayFTEnabledAnnotationKey), + gcsFaultToleranceOptions: &GcsFaultToleranceOptions{}, + expectError: true, + errorMessage: errorMessageBothSet, }, { - name: "FT not set with RAY_REDIS_ADDRESS set", - annotations: map[string]string{}, - envVars: []corev1.EnvVar{ - { - Name: RAY_REDIS_ADDRESS, - Value: "redis://127.0.0.1:6379", - }, - }, - expectError: true, - errorMessage: fmt.Sprintf("%s should not be set when %s is disabled", RAY_REDIS_ADDRESS, RayFTEnabledAnnotationKey), + name: "ray.io/ft-enabled is not set and GcsFaultToleranceOptions is set", + gcsFaultToleranceOptions: &GcsFaultToleranceOptions{}, + expectError: false, }, { - name: "FT disabled with other environment variables set", + name: "ray.io/ft-enabled is not set and GcsFaultToleranceOptions is not set", + gcsFaultToleranceOptions: nil, + expectError: false, + }, + // RAY_REDIS_ADDRESS should not be set if KubeRay is not aware that GCS fault tolerance is enabled. + { + name: "ray.io/ft-enabled is set to false and RAY_REDIS_ADDRESS is set", annotations: map[string]string{ RayFTEnabledAnnotationKey: "false", }, envVars: []corev1.EnvVar{ { - Name: "SOME_OTHER_ENV", - Value: "some-value", + Name: RAY_REDIS_ADDRESS, + Value: "redis:6379", }, }, - expectError: false, - }, - { - name: "FT enabled, GcsFaultToleranceOptions not nil", - annotations: map[string]string{ - RayFTEnabledAnnotationKey: "true", - }, - gcsFaultToleranceOptions: &GcsFaultToleranceOptions{ - RedisAddress: "redis://127.0.0.1:6379", - }, - expectError: false, - }, - { - name: "FT enabled, GcsFaultToleranceOptions is nil", - annotations: map[string]string{ - RayFTEnabledAnnotationKey: "true", - }, - expectError: false, + expectError: true, + errorMessage: errorMessageRedisAddressSet, }, { - name: "FT enabled with with other environment variables set", - annotations: map[string]string{ - RayFTEnabledAnnotationKey: "true", - }, + name: "ray.io/ft-enabled is not set and RAY_REDIS_ADDRESS is set", envVars: []corev1.EnvVar{ { - Name: "SOME_OTHER_ENV", - Value: "some-value", + Name: RAY_REDIS_ADDRESS, + Value: "redis:6379", }, }, - expectError: false, + expectError: true, + errorMessage: errorMessageRedisAddressSet, }, { - name: "FT enabled with RAY_REDIS_ADDRESS set", + name: "ray.io/ft-enabled is set to true and RAY_REDIS_ADDRESS is set", annotations: map[string]string{ RayFTEnabledAnnotationKey: "true", }, envVars: []corev1.EnvVar{ { Name: RAY_REDIS_ADDRESS, - Value: "redis://127.0.0.1:6379", + Value: "redis:6379", }, }, expectError: false, }, - { - name: "FT disabled with no GcsFaultToleranceOptions and no RAY_REDIS_ADDRESS", - annotations: map[string]string{ - RayFTEnabledAnnotationKey: "false", - }, - expectError: false, - }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - r := &RayCluster{ + rayCluster := &RayCluster{ ObjectMeta: metav1.ObjectMeta{ Annotations: tt.annotations, }, @@ -133,8 +109,7 @@ func TestValidateRayClusterSpec(t *testing.T) { Spec: corev1.PodSpec{ Containers: []corev1.Container{ { - Name: "ray-head", - Env: tt.envVars, + Env: tt.envVars, }, }, }, @@ -142,7 +117,7 @@ func TestValidateRayClusterSpec(t *testing.T) { }, }, } - err := r.ValidateRayClusterSpec() + err := rayCluster.ValidateRayClusterSpec() if tt.expectError { assert.NotNil(t, err) assert.IsType(t, &field.Error{}, err) @@ -154,92 +129,266 @@ func TestValidateRayClusterSpec(t *testing.T) { } } -func TestValidateRayCluster(t *testing.T) { +func TestValidateRayClusterSpecRedisPassword(t *testing.T) { tests := []struct { - GcsFaultToleranceOptions *GcsFaultToleranceOptions + gcsFaultToleranceOptions *GcsFaultToleranceOptions name string - errorMessage string - ObjectMeta metav1.ObjectMeta - WorkerGroupSpecs []WorkerGroupSpec + rayStartParams map[string]string + envVars []corev1.EnvVar expectError bool }{ { - name: "Invalid name", - ObjectMeta: metav1.ObjectMeta{ - Name: "Invalid_Name", + name: "GcsFaultToleranceOptions is set and `redis-password` is also set in rayStartParams", + gcsFaultToleranceOptions: &GcsFaultToleranceOptions{}, + rayStartParams: map[string]string{ + "redis-password": "password", }, - expectError: true, - errorMessage: "name must consist of lower case alphanumeric characters or '-', start with an alphabetic character, and end with an alphanumeric character", + expectError: true, }, { - name: "Duplicate worker group names", - - ObjectMeta: metav1.ObjectMeta{ - Name: "valid-name", + name: "GcsFaultToleranceOptions is set and `REDIS_PASSWORD` env var is also set in the head Pod", + gcsFaultToleranceOptions: &GcsFaultToleranceOptions{}, + envVars: []corev1.EnvVar{ + { + Name: REDIS_PASSWORD, + Value: "password", + }, + }, + expectError: true, + }, + { + name: "GcsFaultToleranceOptions.RedisPassword is set", + gcsFaultToleranceOptions: &GcsFaultToleranceOptions{ + RedisPassword: &RedisCredential{ + Value: "password", + }, }, + expectError: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + rayCluster := &RayCluster{ + Spec: RayClusterSpec{ + GcsFaultToleranceOptions: tt.gcsFaultToleranceOptions, + HeadGroupSpec: HeadGroupSpec{ + RayStartParams: tt.rayStartParams, + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Env: tt.envVars, + }, + }, + }, + }, + }, + }, + } + err := rayCluster.ValidateRayClusterSpec() + if tt.expectError { + assert.NotNil(t, err) + assert.IsType(t, &field.Error{}, err) + } else { + assert.Nil(t, err) + } + }) + } +} - WorkerGroupSpecs: []WorkerGroupSpec{ - {GroupName: "group1"}, - {GroupName: "group1"}, +func TestValidateRayClusterSpecEmptyContainers(t *testing.T) { + headGroupSpecWithOneContainer := HeadGroupSpec{ + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{{Name: "ray-head"}}, + }, + }, + } + workerGroupSpecWithOneContainer := WorkerGroupSpec{ + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{{Name: "ray-worker"}}, }, + }, + } + headGroupSpecWithNoContainers := *headGroupSpecWithOneContainer.DeepCopy() + headGroupSpecWithNoContainers.Template.Spec.Containers = []corev1.Container{} + workerGroupSpecWithNoContainers := *workerGroupSpecWithOneContainer.DeepCopy() + workerGroupSpecWithNoContainers.Template.Spec.Containers = []corev1.Container{} + tests := []struct { + rayCluster *RayCluster + name string + errorMessage string + expectError bool + }{ + { + name: "headGroupSpec has no containers", + rayCluster: &RayCluster{ + Spec: RayClusterSpec{ + HeadGroupSpec: headGroupSpecWithNoContainers, + }, + }, expectError: true, - errorMessage: "worker group names must be unique", + errorMessage: "headGroupSpec should have at least one container", }, { - name: "FT disabled with GcsFaultToleranceOptions set", - - ObjectMeta: metav1.ObjectMeta{ - Name: "valid-name", - Annotations: map[string]string{ - RayFTEnabledAnnotationKey: "false", + name: "workerGroupSpec has no containers", + rayCluster: &RayCluster{ + Spec: RayClusterSpec{ + HeadGroupSpec: headGroupSpecWithOneContainer, + WorkerGroupSpecs: []WorkerGroupSpec{workerGroupSpecWithNoContainers}, }, }, - GcsFaultToleranceOptions: &GcsFaultToleranceOptions{}, - expectError: true, - errorMessage: fmt.Sprintf("GcsFaultToleranceOptions should be nil when %s annotation is set to false", RayFTEnabledAnnotationKey), + expectError: true, + errorMessage: "workerGroupSpec should have at least one container", }, { - name: "Valid RayCluster", - - ObjectMeta: metav1.ObjectMeta{ - Name: "valid-name", - Annotations: map[string]string{ - RayFTEnabledAnnotationKey: "true", + name: "valid cluster with containers in both head and worker groups", + rayCluster: &RayCluster{ + Spec: RayClusterSpec{ + HeadGroupSpec: headGroupSpecWithOneContainer, + WorkerGroupSpecs: []WorkerGroupSpec{workerGroupSpecWithOneContainer}, }, }, - GcsFaultToleranceOptions: &GcsFaultToleranceOptions{ - RedisAddress: "redis://127.0.0.1:6379", - }, - WorkerGroupSpecs: []WorkerGroupSpec{ - {GroupName: "group1"}, - {GroupName: "group2"}, - }, expectError: false, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - rayCluster := &RayCluster{ - ObjectMeta: tt.ObjectMeta, + err := tt.rayCluster.ValidateRayClusterSpec() + if tt.expectError { + assert.NotNil(t, err) + assert.IsType(t, &field.Error{}, err) + assert.Equal(t, err.Detail, tt.errorMessage) + } else { + assert.Nil(t, err) + } + }) + } +} + +func TestValidateRayCluster(t *testing.T) { + validHeadGroupSpec := HeadGroupSpec{ + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + {Name: "ray-head"}, + }, + }, + }, + } + workerGroupSpec := WorkerGroupSpec{ + GroupName: "worker-group-1", + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + {Name: "ray-worker"}, + }, + }, + }, + } + workerGroupSpecs := []WorkerGroupSpec{workerGroupSpec} + + tests := []struct { + name string + rayCluster *RayCluster + expectError bool + errorMessage string + }{ + { + name: "valid RayCluster", + rayCluster: &RayCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "valid-name", + }, + Spec: RayClusterSpec{ + HeadGroupSpec: validHeadGroupSpec, + WorkerGroupSpecs: workerGroupSpecs, + }, + }, + expectError: false, + }, + { + name: "invalid rayCluster name", + rayCluster: &RayCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "Invalid_Name", + }, + Spec: RayClusterSpec{ + HeadGroupSpec: validHeadGroupSpec, + WorkerGroupSpecs: workerGroupSpecs, + }, + }, + expectError: true, + errorMessage: "name must consist of lower case alphanumeric characters or '-', start with an alphabetic character, and end with an alphanumeric character", + }, + { + name: "duplicate worker group names", + rayCluster: &RayCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "valid-name", + }, + Spec: RayClusterSpec{ + HeadGroupSpec: validHeadGroupSpec, + WorkerGroupSpecs: []WorkerGroupSpec{ + workerGroupSpec, + workerGroupSpec, + }, + }, + }, + expectError: true, + errorMessage: "worker group names must be unique", + }, + { + name: "headGroupSpec has no containers", + rayCluster: &RayCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "valid-name", + }, Spec: RayClusterSpec{ - GcsFaultToleranceOptions: tt.GcsFaultToleranceOptions, HeadGroupSpec: HeadGroupSpec{ Template: corev1.PodTemplateSpec{ Spec: corev1.PodSpec{ - Containers: []corev1.Container{ - { - Name: "ray-head", - }, + Containers: []corev1.Container{}, + }, + }, + }, + WorkerGroupSpecs: workerGroupSpecs, + }, + }, + expectError: true, + errorMessage: "headGroupSpec should have at least one container", + }, + { + name: "workerGroupSpec has no containers", + rayCluster: &RayCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "valid-name", + }, + Spec: RayClusterSpec{ + HeadGroupSpec: validHeadGroupSpec, + WorkerGroupSpecs: []WorkerGroupSpec{ + { + GroupName: "worker-group-1", + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{}, }, }, }, }, - WorkerGroupSpecs: tt.WorkerGroupSpecs, }, - } - err := rayCluster.validateRayCluster() + }, + expectError: true, + errorMessage: "workerGroupSpec should have at least one container", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := tt.rayCluster.validateRayCluster() if tt.expectError { assert.NotNil(t, err) assert.IsType(t, &apierrors.StatusError{}, err) diff --git a/ray-operator/controllers/ray/raycluster_controller_unit_test.go b/ray-operator/controllers/ray/raycluster_controller_unit_test.go index a9d6a5c316..5575f58793 100644 --- a/ray-operator/controllers/ray/raycluster_controller_unit_test.go +++ b/ray-operator/controllers/ray/raycluster_controller_unit_test.go @@ -3533,7 +3533,7 @@ func TestValidateRayClusterSpecGcsFaultToleranceOptions(t *testing.T) { errorMessage: errorMessageRedisAddressSet, }, { - name: "FT is disabled and RAY_REDIS_ADDRESS is set", + name: "ray.io/ft-enabled is not set and RAY_REDIS_ADDRESS is set", envVars: []corev1.EnvVar{ { Name: utils.RAY_REDIS_ADDRESS, From 8ba5b91673fccaa7c17243d08b53509151fea00c Mon Sep 17 00:00:00 2001 From: Cheyu Wu Date: Thu, 16 Jan 2025 00:54:46 +0800 Subject: [PATCH 09/13] test: add unit test for EnvVarExists --- ray-operator/apis/ray/v1/utils_test.go | 50 ++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 ray-operator/apis/ray/v1/utils_test.go diff --git a/ray-operator/apis/ray/v1/utils_test.go b/ray-operator/apis/ray/v1/utils_test.go new file mode 100644 index 0000000000..5358cfc904 --- /dev/null +++ b/ray-operator/apis/ray/v1/utils_test.go @@ -0,0 +1,50 @@ +package v1 + +import ( + "testing" + + corev1 "k8s.io/api/core/v1" +) + +func TestEnvVarExists(t *testing.T) { + tests := []struct { + name string + envName string + envVars []corev1.EnvVar + expected bool + }{ + { + name: "env var exists", + envName: "EXISTING_ENV", + envVars: []corev1.EnvVar{ + {Name: "EXISTING_ENV", Value: "value1"}, + {Name: "ANOTHER_ENV", Value: "value2"}, + }, + expected: true, + }, + { + name: "env var does not exist", + envName: "NON_EXISTING_ENV", + envVars: []corev1.EnvVar{ + {Name: "EXISTING_ENV", Value: "value1"}, + {Name: "ANOTHER_ENV", Value: "value2"}, + }, + expected: false, + }, + { + name: "empty env vars", + envName: "ANY_ENV", + envVars: []corev1.EnvVar{}, + expected: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := EnvVarExists(tt.envName, tt.envVars) + if result != tt.expected { + t.Errorf("EnvVarExists(%s, %v) = %v; expected %v", tt.envName, tt.envVars, result, tt.expected) + } + }) + } +} From b8c8fc03afb0f8bbcfe67c19938edcd4d8c886b9 Mon Sep 17 00:00:00 2001 From: Cheyu Wu Date: Thu, 16 Jan 2025 00:55:59 +0800 Subject: [PATCH 10/13] fix: linter issue --- ray-operator/apis/ray/v1/raycluster_webhook_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ray-operator/apis/ray/v1/raycluster_webhook_test.go b/ray-operator/apis/ray/v1/raycluster_webhook_test.go index 5a18da5752..475afbca5e 100644 --- a/ray-operator/apis/ray/v1/raycluster_webhook_test.go +++ b/ray-operator/apis/ray/v1/raycluster_webhook_test.go @@ -293,10 +293,10 @@ func TestValidateRayCluster(t *testing.T) { workerGroupSpecs := []WorkerGroupSpec{workerGroupSpec} tests := []struct { - name string rayCluster *RayCluster - expectError bool + name string errorMessage string + expectError bool }{ { name: "valid RayCluster", From 6ef8e8cbc2f40f32e23325231df31a85b3cdd924 Mon Sep 17 00:00:00 2001 From: Cheyu Wu Date: Thu, 16 Jan 2025 01:17:29 +0800 Subject: [PATCH 11/13] fix: add container in when name is invalid testcase --- ray-operator/apis/ray/v1/webhook_suite_test.go | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/ray-operator/apis/ray/v1/webhook_suite_test.go b/ray-operator/apis/ray/v1/webhook_suite_test.go index 52f4e10f40..26d1ba51ea 100644 --- a/ray-operator/apis/ray/v1/webhook_suite_test.go +++ b/ray-operator/apis/ray/v1/webhook_suite_test.go @@ -136,11 +136,24 @@ var _ = Describe("RayCluster validating webhook", func() { RayStartParams: map[string]string{"DEADBEEF": "DEADBEEF"}, Template: corev1.PodTemplateSpec{ Spec: corev1.PodSpec{ - Containers: []corev1.Container{}, + Containers: []corev1.Container{ + {Name: "ray-head"}, + }, + }, + }, + }, + WorkerGroupSpecs: []WorkerGroupSpec{ + { + GroupName: "worker-group-1", + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + {Name: "ray-worker"}, + }, + }, }, }, }, - WorkerGroupSpecs: []WorkerGroupSpec{}, }, } From ed586728ef6b500615ad34056d7b2493494bb632 Mon Sep 17 00:00:00 2001 From: Cheyu Wu Date: Thu, 16 Jan 2025 01:44:03 +0800 Subject: [PATCH 12/13] fix: add RayStartParams in workerGroupSpec --- ray-operator/apis/ray/v1/webhook_suite_test.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ray-operator/apis/ray/v1/webhook_suite_test.go b/ray-operator/apis/ray/v1/webhook_suite_test.go index 26d1ba51ea..98f4554613 100644 --- a/ray-operator/apis/ray/v1/webhook_suite_test.go +++ b/ray-operator/apis/ray/v1/webhook_suite_test.go @@ -144,7 +144,8 @@ var _ = Describe("RayCluster validating webhook", func() { }, WorkerGroupSpecs: []WorkerGroupSpec{ { - GroupName: "worker-group-1", + GroupName: "worker-group-1", + RayStartParams: map[string]string{"DEADBEEF": "DEADBEEF"}, Template: corev1.PodTemplateSpec{ Spec: corev1.PodSpec{ Containers: []corev1.Container{ From 41a0ac77c016a50014a1b481fe0e30a0bcedffae Mon Sep 17 00:00:00 2001 From: Cheyu Wu Date: Thu, 16 Jan 2025 23:27:49 +0800 Subject: [PATCH 13/13] fix workergroupSpec invalid info --- ray-operator/apis/ray/v1/raycluster_webhook.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ray-operator/apis/ray/v1/raycluster_webhook.go b/ray-operator/apis/ray/v1/raycluster_webhook.go index 91477ad13b..dc77aa125e 100644 --- a/ray-operator/apis/ray/v1/raycluster_webhook.go +++ b/ray-operator/apis/ray/v1/raycluster_webhook.go @@ -102,11 +102,11 @@ func (r *RayCluster) ValidateRayClusterSpec() *field.Error { ) } - for _, workerGroup := range r.Spec.WorkerGroupSpecs { + for i, workerGroup := range r.Spec.WorkerGroupSpecs { if len(workerGroup.Template.Spec.Containers) == 0 { return field.Invalid( - field.NewPath("spec").Child("workerGroupSpecs"), - r.Spec.WorkerGroupSpecs, + field.NewPath("spec").Child("workerGroupSpecs").Index(i), + workerGroup, "workerGroupSpec should have at least one container", ) }