From 8fb7ff84941212ea1540865798a95fa340e2b881 Mon Sep 17 00:00:00 2001 From: Sam Stoelinga Date: Tue, 22 Oct 2024 16:11:39 -0700 Subject: [PATCH 1/3] eks install guide --- charts/kubeai/values-eks.yaml | 29 ++++++ charts/kubeai/values.yaml | 13 +++ docs/installation/eks.md | 163 ++++++++++++++++++++++++++++++++++ 3 files changed, 205 insertions(+) create mode 100644 charts/kubeai/values-eks.yaml create mode 100644 docs/installation/eks.md diff --git a/charts/kubeai/values-eks.yaml b/charts/kubeai/values-eks.yaml new file mode 100644 index 00000000..9ea125b2 --- /dev/null +++ b/charts/kubeai/values-eks.yaml @@ -0,0 +1,29 @@ +resourceProfiles: + nvidia-gpu-l4: + nodeSelector: + # This only works if you use Karpenter. + karpenter.k8s.aws/instance-gpu-name: "l4" + nvidia-gpu-l40s: + nodeSelector: + # This only works if you use Karpenter. + karpenter.k8s.aws/instance-gpu-name: "l40s" + nvidia-gpu-h100: + nodeSelector: + # H100 have 8 GPU machine shapes only. This doesn't require Karpenter. + node.kubernetes.io/instance-type: "p5.48xlarge" + nvidia-gpu-a100-80gb: + nodeSelector: + # A100 have 8 GPU machine shapes only. This doesn't require Karpenter. + node.kubernetes.io/instance-type: "p4d.24xlarge" + nvidia-gpu-a100-40gb: + nodeSelector: + # A100 have 8 GPU machine shapes only. This doesn't require Karpenter. 
+ node.kubernetes.io/instance-type: "p4de.24xlarge" + +cacheProfiles: + efs-dynamic: + sharedFilesystem: + storageClassName: "efs-sc" + efs-static: + sharedFilesystem: + persistentVolumeName: "efs-pv" \ No newline at end of file diff --git a/charts/kubeai/values.yaml b/charts/kubeai/values.yaml index 8d205b0a..707a0efc 100644 --- a/charts/kubeai/values.yaml +++ b/charts/kubeai/values.yaml @@ -75,6 +75,19 @@ resourceProfiles: operator: "Equal" value: "present" effect: "NoSchedule" + nvidia-gpu-l40s: + imageName: "nvidia-gpu" + limits: + nvidia.com/gpu: "1" + requests: + nvidia.com/gpu: "1" + cpu: "6" + memory: "24Gi" + tolerations: + - key: "nvidia.com/gpu" + operator: "Equal" + value: "present" + effect: "NoSchedule" nvidia-gpu-h100: imageName: "nvidia-gpu" limits: diff --git a/docs/installation/eks.md b/docs/installation/eks.md new file mode 100644 index 00000000..9463a70f --- /dev/null +++ b/docs/installation/eks.md @@ -0,0 +1,163 @@ +# Install on EKS + +
+TIP: Make sure you have enough GPU quota in your AWS account. + +The default quotas for GPU instances are often 0. If that is the case for your account, request a quota increase for the GPU instance types you want to use. + +The following quotas may require an increase if you wish to use GPUs in your EKS cluster: +- All G and VT Spot Instance Requests +- All P5 Spot Instance Requests +- All P4, P3 and P2 Spot Instance Requests +- Running Dedicated p4d Hosts + +
+ +## 1. Create EKS cluster with Karpenter + +Configuration used throughout this guide: + +```bash +export CLUSTER_NAME="cluster-with-karpenter" +export AWS_DEFAULT_REGION="us-west-2" +export K8S_VERSION="1.30" +export GPU_AMI_ID="$(aws ssm get-parameter --name /aws/service/eks/optimized-ami/${K8S_VERSION}/amazon-linux-2-gpu/recommended/image_id --query Parameter.Value --output text)" +``` + +Tweak the above environment variables as needed and re-run if you lose your shell session. + +Create the EKS cluster using eksctl: +```bash +eksctl create cluster -f - < kubeai-models.yaml +catalog: + llama-3.1-8b-instruct-fp8-l4: + enabled: true +EOF + +helm install kubeai-models kubeai/models \ + -f ./kubeai-models.yaml +``` \ No newline at end of file From 3021164d1d1e50f42517d4218aa2eeafdff03b56 Mon Sep 17 00:00:00 2001 From: Sam Stoelinga Date: Thu, 24 Oct 2024 18:50:55 -0700 Subject: [PATCH 2/3] use karpenter based node-labels --- charts/kubeai/values-eks.yaml | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/charts/kubeai/values-eks.yaml b/charts/kubeai/values-eks.yaml index 9ea125b2..44274b03 100644 --- a/charts/kubeai/values-eks.yaml +++ b/charts/kubeai/values-eks.yaml @@ -1,24 +1,21 @@ resourceProfiles: nvidia-gpu-l4: nodeSelector: - # This only works if you use Karpenter. karpenter.k8s.aws/instance-gpu-name: "l4" nvidia-gpu-l40s: nodeSelector: - # This only works if you use Karpenter. karpenter.k8s.aws/instance-gpu-name: "l40s" nvidia-gpu-h100: nodeSelector: - # H100 have 8 GPU machine shapes only. This doesn't require Karpenter. - node.kubernetes.io/instance-type: "p5.48xlarge" + karpenter.k8s.aws/instance-gpu-name: "h100" nvidia-gpu-a100-80gb: nodeSelector: - # A100 have 8 GPU machine shapes only. This doesn't require Karpenter. 
- node.kubernetes.io/instance-type: "p4d.24xlarge" + karpenter.k8s.aws/instance-gpu-name: "a100" + karpenter.k8s.aws/instance-gpu-memory: "81920" nvidia-gpu-a100-40gb: nodeSelector: - # A100 have 8 GPU machine shapes only. This doesn't require Karpenter. - node.kubernetes.io/instance-type: "p4de.24xlarge" + karpenter.k8s.aws/instance-gpu-name: "a100" + karpenter.k8s.aws/instance-gpu-memory: "40960" cacheProfiles: efs-dynamic: From 5ddf51d324947d8e399504e7ab87055c2f7a7e86 Mon Sep 17 00:00:00 2001 From: Sam Stoelinga Date: Thu, 24 Oct 2024 19:09:56 -0700 Subject: [PATCH 3/3] address comments --- docs/installation/eks.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/installation/eks.md b/docs/installation/eks.md index 9463a70f..c4edf6bf 100644 --- a/docs/installation/eks.md +++ b/docs/installation/eks.md @@ -15,7 +15,7 @@ The following quotas may require an increase if you wish to use GPUs in your EKS ## 1. Create EKS cluster with Karpenter -Configuration used throughout this guide: +Set the environment variables used throughout this guide: ```bash export CLUSTER_NAME="cluster-with-karpenter" @@ -24,8 +24,6 @@ export K8S_VERSION="1.30" export GPU_AMI_ID="$(aws ssm get-parameter --name /aws/service/eks/optimized-ami/${K8S_VERSION}/amazon-linux-2-gpu/recommended/image_id --query Parameter.Value --output text)" ``` -Tweak the above environment variables as needed and re-run if you lose your shell session. - Create the EKS cluster using eksctl: ```bash eksctl create cluster -f - <