From cf2d99846fb5905d3686708461328f5e6354abf3 Mon Sep 17 00:00:00 2001
From: shiva kumar
Date: Tue, 13 Aug 2024 09:30:54 +0530
Subject: [PATCH] end-to-end gpu driver validation

---
 .github/workflows/ci.yaml                 | 74 +++++++++++++++++++++++
 tests/cases/nvidia-driver.sh              |  8 +++
 tests/ci-run-e2e.sh                       | 16 ++++++
 tests/holodeck.yaml                       | 26 +++++++++
 tests/local.sh                            | 27 +++++++++
 tests/scripts/.definitions.sh             | 21 +++++++
 tests/scripts/.local.sh                   |  5 ++
 tests/scripts/.rsync-excludes             |  4 ++
 tests/scripts/checks.sh                   | 36 ++++++++++++
 tests/scripts/end-to-end-nvidia-driver.sh | 14 +++++
 tests/scripts/install-operator.sh         | 24 ++++++++
 tests/scripts/prerequisites.sh            | 20 +++++++
 tests/scripts/push.sh                     | 12 ++++
 tests/scripts/remote.sh                   |  7 +++
 tests/scripts/sync.sh                     | 17 ++++++
 tests/scripts/verify-operator.sh          | 19 +++++++
 16 files changed, 330 insertions(+)
 create mode 100644 .github/workflows/ci.yaml
 create mode 100755 tests/cases/nvidia-driver.sh
 create mode 100755 tests/ci-run-e2e.sh
 create mode 100644 tests/holodeck.yaml
 create mode 100755 tests/local.sh
 create mode 100644 tests/scripts/.definitions.sh
 create mode 100644 tests/scripts/.local.sh
 create mode 100644 tests/scripts/.rsync-excludes
 create mode 100755 tests/scripts/checks.sh
 create mode 100755 tests/scripts/end-to-end-nvidia-driver.sh
 create mode 100755 tests/scripts/install-operator.sh
 create mode 100755 tests/scripts/prerequisites.sh
 create mode 100755 tests/scripts/push.sh
 create mode 100755 tests/scripts/remote.sh
 create mode 100755 tests/scripts/sync.sh
 create mode 100755 tests/scripts/verify-operator.sh

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
new file mode 100644
index 00000000..cc2ce1f8
--- /dev/null
+++ b/.github/workflows/ci.yaml
@@ -0,0 +1,74 @@
+# Copyright 2024 NVIDIA CORPORATION
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: CI
+
+on:
+  workflow_run:
+    workflows: [image]
+    types:
+      - completed
+    branches:
+      - main
+
+jobs:
+  e2e-tests-nvidiadriver:
+    # "completed" also fires for failed or cancelled builds; only test successful ones.
+    if: ${{ github.event.workflow_run.conclusion == 'success' }}
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        driver:
+          - 535.183.06
+          - 550.90.07
+
+    steps:
+      - name: Check out code
+        uses: actions/checkout@v4
+
+      - name: Set up Holodeck
+        uses: NVIDIA/holodeck@main
+        env:
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
+          AWS_SESSION_TOKEN: ${{ secrets.AWS_SESSION_TOKEN }}
+        with:
+          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws_ssh_key: ${{ secrets.AWS_SSH_KEY }}
+          holodeck_config: "tests/holodeck.yaml"
+
+      - name: Get public dns name
+        id: get_public_dns_name
+        uses: mikefarah/yq@master
+        with:
+          cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml
+
+      - name: Set and Calculate test vars
+        run: |
+          echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> "$GITHUB_ENV"
+          echo "private_key=${{ github.workspace }}/key.pem" >> "$GITHUB_ENV"
+          echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> "$GITHUB_ENV"
+
+      - name: Validate gpu driver
+        env:
+          TEST_CASE: "./tests/cases/nvidia-driver.sh"
+          # Hand the key over via the environment instead of templating it into the script text.
+          AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
+        run: |
+          sudo chmod 644 ${{ github.workspace }}/.cache/key
+          printf '%s\n' "${AWS_SSH_KEY}" > "${private_key}"
+          chmod 400 "${private_key}"
+          ./tests/ci-run-e2e.sh "${TEST_CASE}" "${COMMIT_SHORT_SHA}-${{ matrix.driver }}"
diff --git
a/tests/cases/nvidia-driver.sh b/tests/cases/nvidia-driver.sh
new file mode 100755
index 00000000..d2afad83
--- /dev/null
+++ b/tests/cases/nvidia-driver.sh
@@ -0,0 +1,8 @@
+#! /bin/bash
+# This test case runs the operator installation / test case with the default options.
+
+SCRIPTS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/../scripts && pwd )"
+source "${SCRIPTS_DIR}"/.definitions.sh
+
+# Run an end-to-end test cycle
+"${SCRIPTS_DIR}"/end-to-end-nvidia-driver.sh
diff --git a/tests/ci-run-e2e.sh b/tests/ci-run-e2e.sh
new file mode 100755
index 00000000..621a7a8e
--- /dev/null
+++ b/tests/ci-run-e2e.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# CI entry point. Usage: ci-run-e2e.sh TEST_CASE TARGET_DRIVER_VERSION
+
+set -xe
+
+if [[ $# -ne 2 ]]; then
+    echo "TEST_CASE TARGET_DRIVER_VERSION are required" >&2
+    exit 1
+fi
+
+export TEST_CASE=${1}
+export TARGET_DRIVER_VERSION=${2}
+
+# Resolve the tests directory from this script's own location so that the
+# caller's working directory does not matter.
+TEST_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+"${TEST_DIR}/local.sh"
diff --git a/tests/holodeck.yaml b/tests/holodeck.yaml
new file mode 100644
index 00000000..39e4d177
--- /dev/null
+++ b/tests/holodeck.yaml
@@ -0,0 +1,26 @@
+apiVersion: holodeck.nvidia.com/v1alpha1
+kind: Environment
+metadata:
+  name: HOLODECK_NAME
+  description: "end-to-end test infrastructure"
+spec:
+  provider: aws
+  auth:
+    keyName: cnt-ci
+    privateKey: HOLODECK_PRIVATE_KEY
+  instance:
+    type: g4dn.xlarge
+    region: us-west-1
+    ingressIpRanges:
+      - 0.0.0.0/0
+    image:
+      architecture: amd64
+      imageId: ami-0ce2cb35386fc22e9
+  containerRuntime:
+    install: true
+    name: containerd
+  kubernetes:
+    install: true
+    installer: kubeadm
+    version: v1.28.5
+    crictlVersion: v1.28.0
diff --git a/tests/local.sh b/tests/local.sh
new file mode 100755
index 00000000..86918588
--- /dev/null
+++ b/tests/local.sh
@@ -0,0 +1,35 @@
+#! /bin/bash
+# Push the project to the remote test instance and run one test case there.
+# Usage: local.sh [TEST_CASE]
+# TEST_CASE may also come from the environment; it is a path relative to the project root.
+
+if [[ $# -ge 1 ]]; then
+    TEST_CASE=${1}
+fi
+
+export PROJECT="gpu-driver-container"
+
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/scripts && pwd )"
+source "${SCRIPT_DIR}/.definitions.sh"
+source "${SCRIPT_DIR}/.local.sh"
+
+# Validate the test case only here: PROJECT_DIR is defined by .definitions.sh, so an
+# earlier check would resolve against "/" and, without "set -e", its result was ignored.
+if [[ -z "${TEST_CASE}" || ! -f "${PROJECT_DIR}/${TEST_CASE}" ]]; then
+    echo "TEST_CASE must name an existing file under ${PROJECT_DIR}: '${TEST_CASE}'" >&2
+    exit 1
+fi
+
+# Sync the project folder to the remote
+"${SCRIPT_DIR}/push.sh"
+
+# We trigger the installation of prerequisites on the remote instance
+remote SKIP_PREREQUISITES="${SKIP_PREREQUISITES}" ./tests/scripts/prerequisites.sh
+
+# We trigger the specified test case on the remote instance.
+# Note: We need to ensure that the required environment variables
+# are forwarded to the remote shell.
+remote \
+    PROJECT="${PROJECT}" \
+    TARGET_DRIVER_VERSION="${TARGET_DRIVER_VERSION}" \
+    "${TEST_CASE}"
diff --git a/tests/scripts/.definitions.sh b/tests/scripts/.definitions.sh
new file mode 100644
index 00000000..4892ea17
--- /dev/null
+++ b/tests/scripts/.definitions.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+set -e
+
+[[ -z "${DEBUG}" ]] || set -x
+
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+TEST_DIR="$( cd "${SCRIPT_DIR}/.." && pwd )"
+PROJECT_DIR="$( cd "${TEST_DIR}/.."
&& pwd )"
+CASES_DIR="$( cd "${TEST_DIR}/cases" && pwd )"
+
+# Set default values if not defined
+: ${HELM:="helm"}
+: ${PROJECT:="$(basename "${PROJECT_DIR}")"}
+
+: ${TEST_NAMESPACE:="test-operator"}
+
+: ${PRIVATE_REGISTRY:="ghcr.io"}
+
+: ${HELM_NVIDIA_REPO:="https://helm.ngc.nvidia.com/nvidia"}
+
+: ${TARGET_DRIVER_VERSION:="550.90.07"}
diff --git a/tests/scripts/.local.sh b/tests/scripts/.local.sh
new file mode 100644
index 00000000..7971a404
--- /dev/null
+++ b/tests/scripts/.local.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+
+# Run a command on the remote instance from inside the synced project folder.
+# All arguments are joined with spaces into a single remote command line, so
+# leading VAR=value words act as environment assignments for that command.
+function remote() {
+    "${SCRIPT_DIR}/remote.sh" "cd ${PROJECT} && $*"
+}
diff --git a/tests/scripts/.rsync-excludes b/tests/scripts/.rsync-excludes
new file mode 100644
index 00000000..06c2f6ef
--- /dev/null
+++ b/tests/scripts/.rsync-excludes
@@ -0,0 +1,4 @@
+vendor/
+.git
+cnt-ci
+key.pem
diff --git a/tests/scripts/checks.sh b/tests/scripts/checks.sh
new file mode 100755
index 00000000..72658653
--- /dev/null
+++ b/tests/scripts/checks.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+# Poll until every pod labelled app=<label> in ${TEST_NAMESPACE} reports Ready
+# and none of them is being deleted.
+# Arguments:
+#   $1 - value of the pods' "app" label
+#   $2 - optional timeout in seconds (default: 45 minutes)
+# Exits the calling script with status 1 once the timeout is exceeded.
+check_pod_ready() {
+    local pod_label=$1
+    local timeout_seconds=${2:-$((60 * 45))}
+    local current_time=0
+    local is_pod_ready is_pod_terminating
+    while :; do
+        echo "Checking $pod_label pod"
+        kubectl get pods -lapp="$pod_label" -n "${TEST_NAMESPACE}"
+
+        echo "Checking $pod_label pod readiness"
+        # One Ready status per line; a failed query is recorded as "terminated".
+        is_pod_ready=$(kubectl get pods -lapp="$pod_label" -n "${TEST_NAMESPACE}" -ojsonpath='{range .items[*]}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' 2>/dev/null || echo "terminated")
+
+        # Ready only if at least one pod matched and no line differs from "True";
+        # comparing the whole output to "True" can never match with several pods.
+        if [[ -n "${is_pod_ready}" ]] && ! grep -qvx "True" <<<"${is_pod_ready}"; then
+            # Check that none of the pods is in terminating state
+            is_pod_terminating=$(kubectl get pods -lapp="$pod_label" -n "${TEST_NAMESPACE}" -o jsonpath='{.items[*].metadata.deletionGracePeriodSeconds}' 2>/dev/null || echo "terminated")
+            if [ "${is_pod_terminating}" != "" ]; then
+                echo "pod $pod_label is in terminating state..."
+            else
+                echo "Pod $pod_label is ready"
+                break
+            fi
+        fi
+
+        if [[ "${current_time}" -gt "${timeout_seconds}" ]]; then
+            echo "timeout reached"
+            exit 1
+        fi
+
+        # Echo useful information on stdout
+        kubectl get pods -n "${TEST_NAMESPACE}"
+
+        echo "Sleeping 5 seconds"
+        current_time=$((current_time + 5))
+        sleep 5
+    done
+}
diff --git a/tests/scripts/end-to-end-nvidia-driver.sh b/tests/scripts/end-to-end-nvidia-driver.sh
new file mode 100755
index 00000000..e87bdba9
--- /dev/null
+++ b/tests/scripts/end-to-end-nvidia-driver.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+source "${SCRIPT_DIR}"/.definitions.sh
+
+echo ""
+echo ""
+echo "--------------Installing the GPU Operator--------------"
+
+# Install the operator with the driver image selected by TARGET_DRIVER_VERSION
+"${SCRIPT_DIR}"/install-operator.sh
+
+"${SCRIPT_DIR}"/verify-operator.sh
+echo "--------------Verification completed for GPU Operator--------------"
diff --git a/tests/scripts/install-operator.sh b/tests/scripts/install-operator.sh
new file mode 100755
index 00000000..3acfcffb
--- /dev/null
+++ b/tests/scripts/install-operator.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+if [[ "${SKIP_INSTALL}" == "true" ]]; then
+    echo "Skipping install: SKIP_INSTALL=${SKIP_INSTALL}"
+    exit 0
+fi
+
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+source "${SCRIPT_DIR}/.definitions.sh"
+
+OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set driver.repository=${PRIVATE_REGISTRY}/nvidia --set driver.version=${TARGET_DRIVER_VERSION}"
+
+# Add the NVIDIA helm repo. Two separate statements, because "set -e" ignores
+# a failure on the left-hand side of "&&".
+${HELM} repo add nvidia "${HELM_NVIDIA_REPO}"
+${HELM} repo update
+
+# Create the test namespace
+kubectl create namespace "${TEST_NAMESPACE}"
+
+# Run the helm install command
+echo "OPERATOR_OPTIONS: $OPERATOR_OPTIONS"
+# OPERATOR_OPTIONS is a flat string of flags and is split into words on purpose.
+# shellcheck disable=SC2086
+${HELM} install gpu-operator nvidia/gpu-operator \
+    -n "${TEST_NAMESPACE}" \
+    ${OPERATOR_OPTIONS} \
+    --wait
diff --git a/tests/scripts/prerequisites.sh b/tests/scripts/prerequisites.sh
new file mode 100755
index
00000000..ee985e55
--- /dev/null
+++ b/tests/scripts/prerequisites.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+if [[ "${SKIP_PREREQUISITES}" == "true" ]]; then
+    echo "Skipping prerequisites: SKIP_PREREQUISITES=${SKIP_PREREQUISITES}"
+    exit 0
+fi
+
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+source "${SCRIPT_DIR}"/.definitions.sh
+# A failing curl must fail the "curl | bash" pipeline at the end of this script.
+set -o pipefail
+
+export DEBIAN_FRONTEND=noninteractive
+
+echo "Load kernel modules i2c_core and ipmi_msghandler"
+sudo modprobe -a i2c_core ipmi_msghandler
+
+echo "Install dependencies"
+# Separate statements: "set -e" ignores a failure on the left-hand side of "&&".
+sudo apt-get update
+sudo apt-get install -y jq
+
+echo "Install Helm"
+# -f makes HTTP errors fail the download instead of piping an error page into bash.
+curl -fsSL https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 | bash
diff --git a/tests/scripts/push.sh b/tests/scripts/push.sh
new file mode 100755
index 00000000..afc350ab
--- /dev/null
+++ b/tests/scripts/push.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+source "${SCRIPT_DIR}/.definitions.sh"
+source "${SCRIPT_DIR}/.local.sh"
+
+# The "~" is quoted on purpose: it must reach the remote side unexpanded.
+# shellcheck disable=SC2088
+REMOTE_PROJECT_FOLDER="~/${PROJECT}"
+
+# Copy over the contents of the project folder
+"${SCRIPT_DIR}/sync.sh" \
+    "${PROJECT_DIR}/" \
+    "${instance_hostname}:${REMOTE_PROJECT_FOLDER}"
diff --git a/tests/scripts/remote.sh b/tests/scripts/remote.sh
new file mode 100755
index 00000000..40c45e8f
--- /dev/null
+++ b/tests/scripts/remote.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+source "${SCRIPT_DIR}/.definitions.sh"
+source "${SCRIPT_DIR}/.local.sh"
+
+# Host key checking is disabled here as well, matching the rsync transport in sync.sh.
+ssh -i "${private_key}" -o StrictHostKeyChecking=no "${instance_hostname}" "${@}"
diff --git a/tests/scripts/sync.sh b/tests/scripts/sync.sh
new file mode 100755
index 00000000..cb020752
--- /dev/null
+++ b/tests/scripts/sync.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Usage: sync.sh SOURCE... DESTINATION  (all arguments are passed on to rsync)
+
+if [[ "${SKIP_SYNC}" == "true" ]]; then
+    echo "Skipping sync: SKIP_SYNC=${SKIP_SYNC}"
+    exit 0
+fi
+
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+source "${SCRIPT_DIR}/.definitions.sh"
+
+source "${SCRIPT_DIR}/.local.sh"
+
+# "$@" is quoted so that each path reaches rsync as exactly one argument.
+rsync -e "ssh -i ${private_key} -o StrictHostKeyChecking=no" \
+    -avz --delete \
+    --exclude-from="${SCRIPT_DIR}/.rsync-excludes" \
+    "${@}"
diff --git a/tests/scripts/verify-operator.sh b/tests/scripts/verify-operator.sh
new file mode 100755
index 00000000..8836d560
--- /dev/null
+++ b/tests/scripts/verify-operator.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+if [[ "${SKIP_VERIFY}" == "true" ]]; then
+    echo "Skipping verify: SKIP_VERIFY=${SKIP_VERIFY}"
+    exit 0
+fi
+
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+source "${SCRIPT_DIR}/.definitions.sh"
+
+# Import the check definitions
+source "${SCRIPT_DIR}/checks.sh"
+
+check_pod_ready "nvidia-driver-daemonset"
+check_pod_ready "nvidia-container-toolkit-daemonset"
+check_pod_ready "nvidia-device-plugin-daemonset"
+check_pod_ready "nvidia-dcgm-exporter"
+check_pod_ready "gpu-feature-discovery"
+check_pod_ready "nvidia-operator-validator"