Skip to content

Commit

Permalink
end-to-end gpu driver validation
Browse files Browse the repository at this point in the history
  • Loading branch information
shivakunv committed Aug 14, 2024
1 parent a019667 commit cf2d998
Show file tree
Hide file tree
Showing 16 changed files with 325 additions and 0 deletions.
69 changes: 69 additions & 0 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# Copyright 2024 NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: CI

# Runs the end-to-end GPU driver validation once the "image" workflow has
# finished for the main branch.
on:
  workflow_run:
    workflows: [image]
    types:
      - completed
    branches:
      - main

jobs:
  e2e-tests-nvidiadriver:
    # `workflow_run` with type `completed` fires for failed and cancelled runs
    # as well; only validate when the image workflow actually succeeded.
    if: ${{ github.event.workflow_run.conclusion == 'success' }}
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Quoted so the versions are always read as strings.
        driver:
          - "535.183.06"
          - "550.90.07"

    steps:
      - name: Check out code
        uses: actions/checkout@v4

      - name: Set up Holodeck
        uses: NVIDIA/holodeck@main
        env:
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
          AWS_SESSION_TOKEN: ${{ secrets.AWS_SESSION_TOKEN }}
        with:
          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws_ssh_key: ${{ secrets.AWS_SSH_KEY }}
          holodeck_config: "tests/holodeck.yaml"

      # Reads the instance's public DNS name from the Holodeck cache file.
      - name: Get public dns name
        id: get_public_dns_name
        uses: mikefarah/yq@master
        with:
          cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml

      # Exports the variables the test scripts read from the environment.
      - name: Set and Calculate test vars
        run: |
          echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> "$GITHUB_ENV"
          echo "private_key=${{ github.workspace }}/key.pem" >> "$GITHUB_ENV"
          echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> "$GITHUB_ENV"

      - name: Validate gpu driver
        env:
          TEST_CASE: "./tests/cases/nvidia-driver.sh"
          # Passed through the environment rather than interpolated into the
          # script text, so the key's content is never parsed as shell code.
          AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
        run: |
          sudo chmod 644 ${{ github.workspace }}/.cache/key
          printf '%s\n' "${AWS_SSH_KEY}" > "${private_key}" && chmod 400 "${private_key}"
          ./tests/ci-run-e2e.sh "${TEST_CASE}" "${COMMIT_SHORT_SHA}-${{ matrix.driver }}"
8 changes: 8 additions & 0 deletions tests/cases/nvidia-driver.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#! /bin/bash
# Test case: load the shared definitions, then run one end-to-end cycle
# through the end-to-end-nvidia-driver.sh script.

# Absolute path of the sibling "scripts" directory, resolved from this file.
case_dir="$(dirname "${BASH_SOURCE[0]}")"
SCRIPTS_DIR="$(cd "${case_dir}/../scripts" && pwd)"

. "${SCRIPTS_DIR}/.definitions.sh"

# Run an end-to-end test cycle
"${SCRIPTS_DIR}/end-to-end-nvidia-driver.sh"
16 changes: 16 additions & 0 deletions tests/ci-run-e2e.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/bash
# CI entry point for the end-to-end tests.
#
# Usage: ci-run-e2e.sh TEST_CASE TARGET_DRIVER_VERSION
#
# Exports both arguments as environment variables and then runs local.sh,
# which sits in the same directory as this script.

set -xe

if [[ $# -ne 2 ]]; then
    # Usage errors go to stderr so they are not mixed into regular output.
    echo "TEST_CASE TARGET_DRIVER_VERSION are required" >&2
    exit 1
fi

export TEST_CASE="${1}"
export TARGET_DRIVER_VERSION="${2}"

# Resolve the tests directory from this script's own location rather than the
# caller's working directory, so the script can be started from anywhere.
TEST_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

"${TEST_DIR}/local.sh"
26 changes: 26 additions & 0 deletions tests/holodeck.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Holodeck environment definition; the CI workflow hands this file to the
# NVIDIA/holodeck action through its `holodeck_config` input.
apiVersion: holodeck.nvidia.com/v1alpha1
kind: Environment
metadata:
# NOTE(review): HOLODECK_NAME looks like a placeholder that is replaced before use - confirm.
name: HOLODECK_NAME
description: "end-to-end test infrastructure"
spec:
provider: aws
auth:
keyName: cnt-ci
# NOTE(review): HOLODECK_PRIVATE_KEY looks like a placeholder for the key path - confirm.
privateKey: HOLODECK_PRIVATE_KEY
instance:
type: g4dn.xlarge
region: us-west-1
# 0.0.0.0/0 matches every IPv4 address, so ingress is not limited by source.
ingressIpRanges:
- 0.0.0.0/0
image:
architecture: amd64
imageId: ami-0ce2cb35386fc22e9
containerRuntime:
install: true
name: containerd
kubernetes:
install: true
installer: kubeadm
version: v1.28.5
crictlVersion: v1.28.0
27 changes: 27 additions & 0 deletions tests/local.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#! /bin/bash
# Pushes the project to the remote instance, installs the prerequisites there
# and then runs one test case remotely.
#
# Usage: local.sh [TEST_CASE]
#   TEST_CASE  test case script, given relative to the project root; when the
#              argument is omitted the TEST_CASE environment variable is used.
#
# Also reads SKIP_PREREQUISITES and TARGET_DRIVER_VERSION from the environment
# and forwards them to the remote commands.

if [[ $# -ge 1 ]]; then
    TEST_CASE="${1}"
fi

export PROJECT="gpu-driver-container"

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/scripts && pwd )"
source "${SCRIPT_DIR}/.definitions.sh"
source "${SCRIPT_DIR}/.local.sh"

# Validate the test case only now: PROJECT_DIR is defined by .definitions.sh,
# so checking the path before sourcing it could never succeed or fail usefully.
if [[ -z "${TEST_CASE}" ]]; then
    echo "TEST_CASE is required (first argument or environment variable)" >&2
    exit 1
fi
if [[ ! -f "${PROJECT_DIR}/${TEST_CASE}" ]]; then
    echo "Test case not found: ${PROJECT_DIR}/${TEST_CASE}" >&2
    exit 1
fi

# Sync the project folder to the remote
"${SCRIPT_DIR}/push.sh"

# We trigger the installation of prerequisites on the remote instance
remote SKIP_PREREQUISITES="${SKIP_PREREQUISITES}" ./tests/scripts/prerequisites.sh

# We trigger the specified test case on the remote instance.
# Note: We need to ensure that the required environment variables
# are forwarded to the remote shell.
remote \
    PROJECT="${PROJECT}" \
    TARGET_DRIVER_VERSION="${TARGET_DRIVER_VERSION}" \
    "${TEST_CASE}"
21 changes: 21 additions & 0 deletions tests/scripts/.definitions.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/bin/bash
# Shared definitions for the test scripts: shell options, directory layout and
# default values. Meant to be sourced.
set -e

# Any non-empty DEBUG value switches on command tracing.
if [[ -n "${DEBUG}" ]]; then
    set -x
fi

# Absolute paths, all derived from the location of this file.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
PROJECT_DIR="$(cd "${TEST_DIR}/.." && pwd)"
CASES_DIR="$(cd "${TEST_DIR}/cases" && pwd)"

# Defaults: a non-empty value that is already set takes precedence.
HELM="${HELM:-helm}"
PROJECT="${PROJECT:-$(basename "${PROJECT_DIR}")}"

TEST_NAMESPACE="${TEST_NAMESPACE:-test-operator}"

PRIVATE_REGISTRY="${PRIVATE_REGISTRY:-ghcr.io}"

HELM_NVIDIA_REPO="${HELM_NVIDIA_REPO:-https://helm.ngc.nvidia.com/nvidia}"

TARGET_DRIVER_VERSION="${TARGET_DRIVER_VERSION:-550.90.07}"
5 changes: 5 additions & 0 deletions tests/scripts/.local.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/usr/bin/env bash
# Helper definitions sourced by the local test scripts.

# remote COMMAND [ARG...]
#
# Runs COMMAND with its arguments on the remote instance from inside the
# ${PROJECT} directory, by passing one command string to remote.sh.
# Requires SCRIPT_DIR and PROJECT to be set by the caller.
function remote() {
    # "$*" joins every argument with single spaces into one string, which
    # avoids local word splitting and globbing of the individual arguments.
    "${SCRIPT_DIR}/remote.sh" "cd ${PROJECT} && $*"
}
4 changes: 4 additions & 0 deletions tests/scripts/.rsync-excludes
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Paths left out when the project folder is synced to the remote instance
# (sync.sh passes this file to rsync via --exclude-from).
vendor/
.git
# NOTE(review): the next two entries are presumably SSH key material that must not be copied - confirm.
cnt-ci
key.pem
36 changes: 36 additions & 0 deletions tests/scripts/checks.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/bin/bash

# check_pod_ready POD_LABEL
#
# Polls the pods labelled app=POD_LABEL in ${TEST_NAMESPACE} every 5 seconds
# until every matching pod reports Ready=True and none of them is being
# deleted. Exits the calling shell with status 1 once more than 45 minutes of
# sleep time have accumulated.
check_pod_ready() {
    local pod_label=$1
    local current_time=0
    local timeout_seconds=$((60 * 45))
    local poll_seconds=5
    # Declared separately from their assignments so they stay function-local
    # without masking the exit status of the command substitutions.
    local ready_states=""
    local terminating=""

    while :; do
        echo "Checking $pod_label pod"
        kubectl get pods -lapp="$pod_label" -n "${TEST_NAMESPACE}"

        echo "Checking $pod_label pod readiness"
        # One line per pod holding the status of its Ready condition.
        ready_states=$(kubectl get pods -lapp="$pod_label" -n "${TEST_NAMESPACE}" -ojsonpath='{range .items[*]}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' 2>/dev/null || echo "terminated")

        # Ready only when there is at least one line and no line differs from
        # "True"; a plain string comparison would never match for two or more pods.
        if [[ -n "${ready_states}" ]] && ! grep -qvx 'True' <<< "${ready_states}"; then
            # Check if the pod is not in terminating state: any pod that
            # carries a deletion grace period is on its way out.
            terminating=$(kubectl get pods -lapp="$pod_label" -n "${TEST_NAMESPACE}" -o jsonpath='{range .items[*]}{.metadata.deletionGracePeriodSeconds}{end}' 2>/dev/null || echo "terminated")
            if [ "${terminating}" != "" ]; then
                echo "pod $pod_label is in terminating state..."
            else
                echo "Pod $pod_label is ready"
                break
            fi
        fi

        if [[ "${current_time}" -gt "${timeout_seconds}" ]]; then
            echo "timeout reached"
            exit 1
        fi

        # Echo useful information on stdout
        kubectl get pods -n "${TEST_NAMESPACE}"

        echo "Sleeping ${poll_seconds} seconds"
        current_time=$((current_time + poll_seconds))
        sleep "${poll_seconds}"
    done
}
14 changes: 14 additions & 0 deletions tests/scripts/end-to-end-nvidia-driver.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/bin/bash
# End-to-end cycle: run install-operator.sh, then verify-operator.sh.

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
. "${SCRIPT_DIR}/.definitions.sh"

# Two empty lines set the banner apart from whatever was printed before.
printf '\n\n'
echo "--------------Installing the GPU Operator--------------"

# Step 1: install the operator.
"${SCRIPT_DIR}/install-operator.sh"

# Step 2: verify the installation.
"${SCRIPT_DIR}/verify-operator.sh"
echo "--------------Verification completed for GPU Operator--------------"
24 changes: 24 additions & 0 deletions tests/scripts/install-operator.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/bin/bash
# Installs the GPU Operator Helm chart into ${TEST_NAMESPACE}, pointing the
# driver image at ${PRIVATE_REGISTRY}/nvidia with tag ${TARGET_DRIVER_VERSION}.
#
# Environment:
#   SKIP_INSTALL      "true" skips the whole script.
#   OPERATOR_OPTIONS  extra arguments appended to the helm install command.
#   HELM              helm executable to use (default comes from .definitions.sh).

if [[ "${SKIP_INSTALL}" == "true" ]]; then
    echo "Skipping install: SKIP_INSTALL=${SKIP_INSTALL}"
    exit 0
fi

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source "${SCRIPT_DIR}/.definitions.sh"

OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set driver.repository=${PRIVATE_REGISTRY}/nvidia --set driver.version=${TARGET_DRIVER_VERSION}"

# add helm driver repo
# Uses ${HELM} like the install below, so an overridden helm binary is
# honoured for every helm invocation rather than just the last one.
${HELM} repo add nvidia "${HELM_NVIDIA_REPO}" && ${HELM} repo update

# Create the test namespace
kubectl create namespace "${TEST_NAMESPACE}"

# Run the helm install command
echo "OPERATOR_OPTIONS: $OPERATOR_OPTIONS"
# OPERATOR_OPTIONS is deliberately left unquoted: it holds several
# space-separated arguments that must be split into separate words.
${HELM} install gpu-operator nvidia/gpu-operator \
    -n "${TEST_NAMESPACE}" \
    ${OPERATOR_OPTIONS} \
    --wait
20 changes: 20 additions & 0 deletions tests/scripts/prerequisites.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/bin/bash
# Prepares the instance for the tests: loads kernel modules, installs jq and
# installs Helm. Setting SKIP_PREREQUISITES=true skips everything.

if [[ "${SKIP_PREREQUISITES}" == "true" ]]; then
    echo "Skipping prerequisites: SKIP_PREREQUISITES=${SKIP_PREREQUISITES}"
    exit 0
fi

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source "${SCRIPT_DIR}"/.definitions.sh

# Without pipefail the status of "curl | bash" is bash's alone, so a failed
# download would go unnoticed.
set -o pipefail

export DEBIAN_FRONTEND=noninteractive

echo "Load kernel modules i2c_core and ipmi_msghandler"
sudo modprobe -a i2c_core ipmi_msghandler

echo "Install dependencies"
# apt-get instead of apt: apt warns that its command-line interface is not
# stable enough for use in scripts.
sudo apt-get update && sudo apt-get install -y jq

echo "Install Helm"
# -f turns HTTP errors into a non-zero exit instead of piping an error page
# into bash; -sS hides the progress meter but keeps errors; -L follows redirects.
curl -fsSL https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 | bash
12 changes: 12 additions & 0 deletions tests/scripts/push.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/bin/bash
# Copies the contents of the local project folder to ~/${PROJECT} on the
# remote instance by delegating to sync.sh.

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
. "${SCRIPT_DIR}/.definitions.sh"
. "${SCRIPT_DIR}/.local.sh"

# The tilde sits inside double quotes, so the local shell passes it on
# literally instead of expanding it to the local home directory.
remote_target="${instance_hostname}:~/${PROJECT}"

# The trailing slash on the source is part of the argument handed to sync.sh.
"${SCRIPT_DIR}/sync.sh" "${PROJECT_DIR}/" "${remote_target}"
7 changes: 7 additions & 0 deletions tests/scripts/remote.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/bin/bash
# Runs the given command line on the remote instance over ssh.
#
# Usage: remote.sh COMMAND [ARG...]
#
# Required environment:
#   private_key        path of the ssh identity file
#   instance_hostname  ssh destination, e.g. user@host

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source "${SCRIPT_DIR}/.definitions.sh"
source "${SCRIPT_DIR}/.local.sh"

# Stop with a clear message instead of letting ssh misread its arguments when
# one of the required variables is missing.
: "${private_key:?private_key must be set}"
: "${instance_hostname:?instance_hostname must be set}"

# Same host-key option as the ssh transport in sync.sh, so the command also
# works when it is the first connection to a freshly created instance.
ssh -i "${private_key}" -o StrictHostKeyChecking=no "${instance_hostname}" "${@}"
17 changes: 17 additions & 0 deletions tests/scripts/sync.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash
# Mirrors a source path to a destination with rsync over ssh.
#
# Usage: sync.sh SOURCE DESTINATION
#
# All arguments are passed on to rsync unchanged. Setting SKIP_SYNC=true skips
# the transfer. Reads the ssh identity file path from ${private_key}.

if [[ "${SKIP_SYNC}" == "true" ]]; then
    echo "Skipping sync: SKIP_SYNC=${SKIP_SYNC}"
    exit 0
fi

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source "${SCRIPT_DIR}/.definitions.sh"

source "${SCRIPT_DIR}/.local.sh"

# --delete removes files on the destination that no longer exist in the source.
# "${@}" is quoted so each argument reaches rsync as one word, even when a
# path contains spaces or glob characters.
rsync -e "ssh -i ${private_key} -o StrictHostKeyChecking=no" \
    -avz --delete \
    --exclude-from="${SCRIPT_DIR}/.rsync-excludes" \
    "${@}"

19 changes: 19 additions & 0 deletions tests/scripts/verify-operator.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/bin/bash
# Waits, one component after another, for the operator's pods to become ready.
# Setting SKIP_VERIFY=true skips the verification.

if [[ "${SKIP_VERIFY}" == "true" ]]; then
    echo "Skipping verify: SKIP_VERIFY=${SKIP_VERIFY}"
    exit 0
fi

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
. "${SCRIPT_DIR}/.definitions.sh"

# Import the check definitions
. "${SCRIPT_DIR}/checks.sh"

# The labels are checked strictly in this order.
for app_label in \
    "nvidia-driver-daemonset" \
    "nvidia-container-toolkit-daemonset" \
    "nvidia-device-plugin-daemonset" \
    "nvidia-dcgm-exporter" \
    "gpu-feature-discovery" \
    "nvidia-operator-validator"; do
    check_pod_ready "${app_label}"
done

0 comments on commit cf2d998

Please sign in to comment.