From 6d332d7951c0d85e0c9551fbf35a685e1ff9f3f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edgar=20Hern=C3=A1ndez?= <23639005+israel-hdez@users.noreply.github.com> Date: Mon, 9 Sep 2024 14:52:20 -0600 Subject: [PATCH] Modelcar race condition mitigation with an init container MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds the model container as an init-container to mitigate a race condition that would happen if the model container is not present on the cluster-node. The race condition happens if the cluster is able to fetch and start the runtime container before the modelcar is fetched. This would lead to the runtime to terminate with error. By configuring the model container as an init-container the runtime won't start until the modelcar is fetched. Although there is still the risk of a race condition when the cluster schedules the runtime container first, the pod should stabilize after a few restarts of the runtime container and should either prevent a CrashLoopBackOff event on the pod, or the crash event would finish quickly. This improves compatibility with the runtimes which can now stay agnostic to the modelcar implementation, until better techniques (like native sidecars, and oci volume mounts) become mature. Signed-off-by: Edgar Hernández <23639005+israel-hdez@users.noreply.github.com> --- .../pod/storage_initializer_injector.go | 42 +++++++++++++++++++ .../pod/storage_initializer_injector_test.go | 27 ++++++++++++ 2 files changed, 69 insertions(+) diff --git a/pkg/webhook/admission/pod/storage_initializer_injector.go b/pkg/webhook/admission/pod/storage_initializer_injector.go index f19dc5056c6..1af28320081 100644 --- a/pkg/webhook/admission/pod/storage_initializer_injector.go +++ b/pkg/webhook/admission/pod/storage_initializer_injector.go @@ -48,6 +48,7 @@ const ( PvcSourceMountPath = "/mnt/pvc" CaBundleVolumeName = "cabundle-cert" ModelcarContainerName = "modelcar" + ModelcarInitContainerName = "modelcar-init" ModelInitModeEnv = "MODEL_INIT_MODE" CpuModelcarDefault = "10m" MemoryModelcarDefault = "15Mi" @@ -172,6 +173,11 @@ func (mi *StorageInitializerInjector) InjectModelcar(pod *v1.Pod) error { if getContainerWithName(pod, ModelcarContainerName) == nil { modelContainer := mi.createModelContainer(image, constants.DefaultModelLocalMountPath) pod.Spec.Containers = append(pod.Spec.Containers, *modelContainer) + + // Add the model container as an init-container to pre-fetch the model before + // the runtimes starts. + modelInitContainer := mi.createModelInitContainer(image) + pod.Spec.InitContainers = append(pod.Spec.InitContainers, *modelInitContainer) } // Enable process namespace sharing so that the modelcar's root filesystem @@ -684,6 +690,42 @@ func (mi *StorageInitializerInjector) createModelContainer(image string, modelPa return modelContainer } +func (mi *StorageInitializerInjector) createModelInitContainer(image string) *v1.Container { + cpu := mi.config.CpuModelcar + if cpu == "" { + cpu = CpuModelcarDefault + } + memory := mi.config.MemoryModelcar + if memory == "" { + memory = MemoryModelcarDefault + } + + modelContainer := &v1.Container{ + Name: ModelcarInitContainerName, + Image: image, + Args: []string{ + "sh", + "-c", + // Check that the expected models directory exists + "echo 'Pre-fetching modelcar " + image + ": ' && [ -d /models ] && [ \"$$(ls -A /models)\" ] && echo 'OK ... Prefetched and valid (/models exists)' || (echo 'NOK ... Prefetched but modelcar is invalid (/models does not exist or is empty)' && exit 1)", + }, + Resources: v1.ResourceRequirements{ + Limits: map[v1.ResourceName]resource.Quantity{ + // Could possibly be reduced to even less + v1.ResourceCPU: resource.MustParse(cpu), + v1.ResourceMemory: resource.MustParse(memory), + }, + Requests: map[v1.ResourceName]resource.Quantity{ + v1.ResourceCPU: resource.MustParse(cpu), + v1.ResourceMemory: resource.MustParse(memory), + }, + }, + TerminationMessagePolicy: v1.TerminationMessageFallbackToLogsOnError, + } + + return modelContainer +} + // addEmptyDirVolumeIfNotPresent adds an emptyDir volume only if not present in the // list. pod and pod.Spec must not be nil func addEmptyDirVolumeIfNotPresent(pod *v1.Pod, name string) { diff --git a/pkg/webhook/admission/pod/storage_initializer_injector_test.go b/pkg/webhook/admission/pod/storage_initializer_injector_test.go index f9b48239c01..9256b193359 100644 --- a/pkg/webhook/admission/pod/storage_initializer_injector_test.go +++ b/pkg/webhook/admission/pod/storage_initializer_injector_test.go @@ -2723,6 +2723,33 @@ func TestInjectModelcar(t *testing.T) { t.Errorf("Expected two containers but got %d", len(pod.Spec.Containers)) } + // Check that an init container has been injected, and it is the model container + if len(pod.Spec.InitContainers) != 1 { + t.Errorf("Expected one init container but got %d", len(pod.Spec.InitContainers)) + } else if pod.Spec.InitContainers[0].Name != ModelcarInitContainerName { + t.Errorf("Expected the init container to be the model but got %s", pod.Spec.InitContainers[0].Name) + } else { + // Check that resources are correctly set. + if _, ok := pod.Spec.InitContainers[0].Resources.Limits[v1.ResourceCPU]; !ok { + t.Error("The model container does not have CPU limit set") + } + if _, ok := pod.Spec.InitContainers[0].Resources.Limits[v1.ResourceMemory]; !ok { + t.Error("The model container does not have Memory limit set") + } + if _, ok := pod.Spec.InitContainers[0].Resources.Requests[v1.ResourceCPU]; !ok { + t.Error("The model container does not have CPU request set") + } + if _, ok := pod.Spec.InitContainers[0].Resources.Requests[v1.ResourceMemory]; !ok { + t.Error("The model container does not have Memory request set") + } + + // Check args + joinedArgs := strings.Join(pod.Spec.InitContainers[0].Args, " ") + if !strings.Contains(joinedArgs, "Prefetched") { + t.Errorf("The model container args are not correctly setup. Got: %s", joinedArgs) + } + } + // Check that the user-container has an env var set found := false if pod.Spec.Containers[0].Env != nil {