Skip to content

Commit

Permalink
Modelcar race condition mitigation with an init container
Browse files Browse the repository at this point in the history
This adds the model container as an init-container to mitigate a race
condition that would happen if the model container is not present on the
cluster-node. The race condition happens if the cluster is able to fetch
and start the runtime container before the modelcar is fetched. This
would lead to the runtime to terminate with error.

By configuring the model container as an init-container the runtime
won't start until the modelcar is fetched. Although there is still the
risk of a race condition when the cluster schedules the runtime
container first, the pod should stabilize after a few restarts of the
runtime container and should either prevent a CrashLoopBackOff event on
the pod, or the crash event would finish quickly.

This improves compatibility with the runtimes which can now stay
agnostic to the modelcar implementation, until better techniques (like
native sidecars, and oci volume mounts) become mature.

Signed-off-by: Edgar Hernández <[email protected]>
  • Loading branch information
israel-hdez committed Sep 13, 2024
1 parent 43be4a8 commit 6d332d7
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 0 deletions.
42 changes: 42 additions & 0 deletions pkg/webhook/admission/pod/storage_initializer_injector.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ const (
PvcSourceMountPath = "/mnt/pvc"
CaBundleVolumeName = "cabundle-cert"
ModelcarContainerName = "modelcar"
ModelcarInitContainerName = "modelcar-init"
ModelInitModeEnv = "MODEL_INIT_MODE"
CpuModelcarDefault = "10m"
MemoryModelcarDefault = "15Mi"
Expand Down Expand Up @@ -172,6 +173,11 @@ func (mi *StorageInitializerInjector) InjectModelcar(pod *v1.Pod) error {
if getContainerWithName(pod, ModelcarContainerName) == nil {
modelContainer := mi.createModelContainer(image, constants.DefaultModelLocalMountPath)
pod.Spec.Containers = append(pod.Spec.Containers, *modelContainer)

// Add the model container as an init-container to pre-fetch the model before
// the runtimes starts.
modelInitContainer := mi.createModelInitContainer(image)
pod.Spec.InitContainers = append(pod.Spec.InitContainers, *modelInitContainer)
}

// Enable process namespace sharing so that the modelcar's root filesystem
Expand Down Expand Up @@ -684,6 +690,42 @@ func (mi *StorageInitializerInjector) createModelContainer(image string, modelPa
return modelContainer
}

func (mi *StorageInitializerInjector) createModelInitContainer(image string) *v1.Container {
cpu := mi.config.CpuModelcar
if cpu == "" {
cpu = CpuModelcarDefault
}
memory := mi.config.MemoryModelcar
if memory == "" {
memory = MemoryModelcarDefault
}

modelContainer := &v1.Container{
Name: ModelcarInitContainerName,
Image: image,
Args: []string{
"sh",
"-c",
// Check that the expected models directory exists
"echo 'Pre-fetching modelcar " + image + ": ' && [ -d /models ] && [ \"$$(ls -A /models)\" ] && echo 'OK ... Prefetched and valid (/models exists)' || (echo 'NOK ... Prefetched but modelcar is invalid (/models does not exist or is empty)' && exit 1)",
},
Resources: v1.ResourceRequirements{
Limits: map[v1.ResourceName]resource.Quantity{
// Could possibly be reduced to even less
v1.ResourceCPU: resource.MustParse(cpu),
v1.ResourceMemory: resource.MustParse(memory),
},
Requests: map[v1.ResourceName]resource.Quantity{
v1.ResourceCPU: resource.MustParse(cpu),
v1.ResourceMemory: resource.MustParse(memory),
},
},
TerminationMessagePolicy: v1.TerminationMessageFallbackToLogsOnError,
}

return modelContainer
}

// addEmptyDirVolumeIfNotPresent adds an emptyDir volume only if not present in the
// list. pod and pod.Spec must not be nil
func addEmptyDirVolumeIfNotPresent(pod *v1.Pod, name string) {
Expand Down
27 changes: 27 additions & 0 deletions pkg/webhook/admission/pod/storage_initializer_injector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2723,6 +2723,33 @@ func TestInjectModelcar(t *testing.T) {
t.Errorf("Expected two containers but got %d", len(pod.Spec.Containers))
}

// Check that an init container has been injected, and it is the model container
if len(pod.Spec.InitContainers) != 1 {
t.Errorf("Expected one init container but got %d", len(pod.Spec.InitContainers))
} else if pod.Spec.InitContainers[0].Name != ModelcarInitContainerName {
t.Errorf("Expected the init container to be the model but got %s", pod.Spec.InitContainers[0].Name)
} else {
// Check that resources are correctly set.
if _, ok := pod.Spec.InitContainers[0].Resources.Limits[v1.ResourceCPU]; !ok {
t.Error("The model container does not have CPU limit set")
}
if _, ok := pod.Spec.InitContainers[0].Resources.Limits[v1.ResourceMemory]; !ok {
t.Error("The model container does not have Memory limit set")
}
if _, ok := pod.Spec.InitContainers[0].Resources.Requests[v1.ResourceCPU]; !ok {
t.Error("The model container does not have CPU request set")
}
if _, ok := pod.Spec.InitContainers[0].Resources.Requests[v1.ResourceMemory]; !ok {
t.Error("The model container does not have Memory request set")
}

// Check args
joinedArgs := strings.Join(pod.Spec.InitContainers[0].Args, " ")
if !strings.Contains(joinedArgs, "Prefetched") {
t.Errorf("The model container args are not correctly setup. Got: %s", joinedArgs)
}
}

// Check that the user-container has an env var set
found := false
if pod.Spec.Containers[0].Env != nil {
Expand Down

0 comments on commit 6d332d7

Please sign in to comment.