diff --git a/.github/workflows/ci-precompiled.yaml b/.github/workflows/ci-precompiled.yaml
new file mode 100644
index 00000000..dd284c42
--- /dev/null
+++ b/.github/workflows/ci-precompiled.yaml
@@ -0,0 +1,112 @@
+# Copyright 2024 NVIDIA CORPORATION
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: Pre-Compiled End-to-end tests
+
+on:
+  workflow_run:
+    workflows: [image]
+    types:
+      - completed
+    branches:
+      - e2etestdriver_no
+  workflow_dispatch:
+
+  pull_request:
+    types:
+      - opened
+      - synchronize
+    branches:
+      # - main
+      # - release-*
+      - e2etestdriver
+  push:
+    branches:
+      # - main
+      # - release-*
+      - e2etestdriver
+
+jobs:
+  e2e-tests-nvidiadriver:
+    # strategy:
+    #   matrix:
+    #     flavor:
+    #       - aws
+    #       - azure
+    #       - generic
+    #       - nvidia
+    #       - oracle
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Check out code
+        uses: actions/checkout@v4
+
+      - name: Set up Holodeck
+        uses: NVIDIA/holodeck@v0.2.1
+        env:
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
+        with:
+          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws_ssh_key: ${{ secrets.AWS_SSH_KEY }}
+          holodeck_config: "tests/holodeck.yaml"
+
+      - name: Get public dns name
+        id: get_public_dns_name
+        uses: mikefarah/yq@master
+        with:
+          cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml
+
+      - name: Set and Calculate test vars
+        run: |
+          echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV
+          echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV
+          echo "${{ secrets.AWS_SSH_KEY }}" > ${{ github.workspace }}/key.pem && chmod 400 ${{ github.workspace }}/key.pem
+          echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
+          DRIVER_VERSIONS=$(grep '^DRIVER_VERSIONS ?=' versions.mk | awk -F' ?= ' '{print $2}')
+          echo "DRIVER_VERSIONS=$DRIVER_VERSIONS" >> $GITHUB_ENV
+          echo "PRIVATE_REGISTRY=ghcr.io" >> $GITHUB_ENV
+
+      - name: Precompiled e2e test - upgrade kernel and validate GPU driver
+        env:
+          # multiple space-separated target scripts can be passed via TARGET_SCRIPT
+          TARGET_SCRIPT: "./tests/cases/precompiled-nvidia-driver.sh ./tests/cases/nvidia-driver.sh"
+          OPERATOR_OPTIONS: "--set driver.repository=${{ env.PRIVATE_REGISTRY }}/nvidia --set driver.usePrecompiled=true"
+        run: |
+          rc=0
+          for driver_version in ${DRIVER_VERSIONS}; do
+            echo "Running e2e for DRIVER_VERSION=$driver_version"
+            # TODO: change the tag component below to KERNEL_FLAVOR
+            DRIVER_BRANCH=$(echo "${driver_version}" | cut -d '.' -f 1)
+            DRIVER_VERSION="${DRIVER_BRANCH}"
+            status=0
+            ./tests/precompile-task_executor.sh "${TARGET_SCRIPT}" "${DRIVER_VERSION}" "${OPERATOR_OPTIONS}" || status=$?
+            if [ $status -ne 0 ]; then
+              echo "Precompiled e2e validation failed for driver branch $DRIVER_VERSION and kernel flavor $KERNEL_FLAVOR with status $status"
+              rc=$status
+            fi
+          done
+          ./tests/scripts/pull.sh /tmp/logs logs
+          exit $rc
+
+      - name: Archive test logs
+        if: ${{ failure() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: nvidiadriver-Precompiled-e2e-test-logs
+          path: ./logs/
+          retention-days: 15
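Note on the version handling in the run step above: each entry from versions.mk is collapsed to its major branch, because precompiled driver images are tagged by branch rather than by full version. A minimal standalone sketch of that derivation, with sample versions standing in for the versions.mk contents:

```bash
#!/bin/bash
# Sample values; the real list is read from versions.mk by the workflow.
DRIVER_VERSIONS="535.183.06 550.90.07"

for driver_version in ${DRIVER_VERSIONS}; do
    # "550.90.07" -> "550"
    DRIVER_BRANCH=$(echo "${driver_version}" | cut -d '.' -f 1)
    echo "version=${driver_version} branch=${DRIVER_BRANCH}"
done
```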
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 3c6e8690..e8445c84 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -20,7 +20,21 @@ on:
     types:
       - completed
     branches:
-      - main
+      - e2etestdriver
+
+  pull_request:
+    types:
+      - opened
+      - synchronize
+    branches:
+      # - main
+      # - release-*
+      - e2etestdriver
+  push:
+    branches:
+      # - main
+      # - release-*
+      - e2etestdriver
 
 jobs:
   e2e-tests-nvidiadriver:
@@ -41,36 +55,38 @@ jobs:
           aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
           aws_ssh_key: ${{ secrets.AWS_SSH_KEY }}
           holodeck_config: "tests/holodeck.yaml"
-
+
       - name: Get public dns name
         id: get_public_dns_name
        uses: mikefarah/yq@master
         with:
-          cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml
+          cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml
 
       - name: Set and Calculate test vars
         run: |
           echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV
           echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV
+          echo "${{ secrets.AWS_SSH_KEY }}" > ${{ github.workspace }}/key.pem && chmod 400 ${{ github.workspace }}/key.pem
           echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
           DRIVER_VERSIONS=$(grep '^DRIVER_VERSIONS ?=' versions.mk | awk -F' ?= ' '{print $2}')
           echo "DRIVER_VERSIONS=$DRIVER_VERSIONS" >> $GITHUB_ENV
-
+          echo "PRIVATE_REGISTRY=ghcr.io" >> $GITHUB_ENV
+
       - name: Validate gpu driver
         env:
           TEST_CASE: "./tests/cases/nvidia-driver.sh"
+          OPERATOR_OPTIONS: "--set driver.repository=${{ env.PRIVATE_REGISTRY }}/nvidia"
         run: |
-          echo "${{ secrets.AWS_SSH_KEY }}" > ${private_key} && chmod 400 ${private_key}
           rc=0
           for driver_version in ${DRIVER_VERSIONS}; do
             echo "Running e2e for DRIVER_VERSION=$driver_version"
-            ./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${driver_version} || status=$?
+            status=0
+            ./tests/ci-run-e2e.sh "${TEST_CASE}" "${COMMIT_SHORT_SHA}-${driver_version}" "${OPERATOR_OPTIONS}" || status=$?
             if [ $status -ne 0 ]; then
               echo "e2e validation failed for driver version $driver_version with status $status"
               rc=$status
             fi
           done
-          source ./tests/scripts/.definitions.sh
           ./tests/scripts/pull.sh /tmp/logs logs
           exit $rc
@@ -80,4 +96,4 @@ jobs:
         with:
           name: nvidiadriver-e2e-test-logs
           path: ./logs/
-          retention-days: 15
+          retention-days: 15
diff --git a/.github/workflows/image.yaml b/.github/workflows/image.yaml
index bfeb3dbd..ef794ed3 100644
--- a/.github/workflows/image.yaml
+++ b/.github/workflows/image.yaml
@@ -21,12 +21,14 @@ on:
       - opened
      - synchronize
     branches:
-      - main
-      - release-*
+      # - main
+      # - release-*
+      - e2etestdriver
   push:
     branches:
-      - main
-      - release-*
+      # - main
+      # - release-*
+      - e2etestdriver
 
 jobs:
   image:
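For reference, the ci.yaml job tags test images with the short commit SHA plus the driver version. A quick sketch of that bash substring expansion, using a made-up SHA:

```bash
#!/bin/bash
# Fabricated SHA standing in for the real GITHUB_SHA.
GITHUB_SHA="3c6e8690d9e8445c84aa11fb2bab4c6f0e52dd28"
COMMIT_SHORT_SHA="${GITHUB_SHA:0:8}"            # first 8 characters
driver_version="550.90.07"
echo "${COMMIT_SHORT_SHA}-${driver_version}"    # -> 3c6e8690-550.90.07
```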
diff --git a/tests/cases/precompiled-nvidia-driver.sh b/tests/cases/precompiled-nvidia-driver.sh
new file mode 100755
index 00000000..dc0870b8
--- /dev/null
+++ b/tests/cases/precompiled-nvidia-driver.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+# This test case upgrades the kernel on the test system before the precompiled driver validation runs.
+
+SCRIPTS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/../scripts && pwd )"
+source "${SCRIPTS_DIR}"/.definitions.sh
+
+# upgrade the kernel
+"${SCRIPTS_DIR}"/upgrade-kernel.sh
diff --git a/tests/ci-run-e2e.sh b/tests/ci-run-e2e.sh
index 621a7a8e..7ff11352 100755
--- a/tests/ci-run-e2e.sh
+++ b/tests/ci-run-e2e.sh
@@ -2,14 +2,14 @@
 
 set -xe
 
-if [[ $# -ne 2 ]]; then
-  echo "TEST_CASE TARGET_DRIVER_VERSION are required"
+if [[ $# -ne 3 ]]; then
+  echo "TEST_CASE TARGET_DRIVER_VERSION OPERATOR_OPTIONS are required"
   exit 1
 fi
 
-export TEST_CASE=${1}
-export TARGET_DRIVER_VERSION=${2}
-
+export TEST_CASE="${1}"
+export TARGET_DRIVER_VERSION="${2}"
+export OPERATOR_OPTIONS="${3}"
 
 TEST_DIR="$(pwd)/tests"
diff --git a/tests/local.sh b/tests/local.sh
index 86918588..67167522 100755
--- a/tests/local.sh
+++ b/tests/local.sh
@@ -24,4 +24,5 @@ remote SKIP_PREREQUISITES="${SKIP_PREREQUISITES}" ./tests/scripts/prerequisites.sh
 remote \
     PROJECT="${PROJECT}" \
     TARGET_DRIVER_VERSION="${TARGET_DRIVER_VERSION}" \
+    OPERATOR_OPTIONS=\"${OPERATOR_OPTIONS}\" \
     ${TEST_CASE}
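The backslash-escaped quotes around OPERATOR_OPTIONS in local.sh exist because the remote invocation re-parses its arguments; without them, a value containing spaces would be split at the first space and the assignment would break. A self-contained illustration, with eval standing in for the re-parsing that remote.sh's SSH hop performs:

```bash
#!/bin/bash
# eval stands in for the argument re-parsing done by remote.sh over SSH.
fake_remote() { eval "$@"; }

OPERATOR_OPTIONS="--set driver.repository=ghcr.io/nvidia --set driver.usePrecompiled=true"

# With the escaped quotes (as in local.sh) the remote shell sees a single
# assignment, and the child process receives the full option string:
fake_remote OPERATOR_OPTIONS=\"${OPERATOR_OPTIONS}\" printenv OPERATOR_OPTIONS
```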
diff --git a/tests/precompile-task_executor.sh b/tests/precompile-task_executor.sh
new file mode 100755
index 00000000..97337c1f
--- /dev/null
+++ b/tests/precompile-task_executor.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+set -xe
+
+if [[ $# -ne 3 ]]; then
+  echo "TARGET_SCRIPT TARGET_DRIVER_VERSION OPERATOR_OPTIONS are required"
+  exit 1
+fi
+
+export TARGET_SCRIPT="${1}"
+export TARGET_DRIVER_VERSION="${2}"
+export OPERATOR_OPTIONS="${3}"
+
+SCRIPT_DIR="$(pwd)/tests/scripts"
+
+IFS=' ' read -r -a array <<< "$TARGET_SCRIPT"
+# execute each target script, then wait and re-establish the remote connection
+for target_script in "${array[@]}"; do
+  echo "target_script: $target_script"
+  status=0
+  ./tests/ci-run-e2e.sh "${target_script}" "${TARGET_DRIVER_VERSION}" "${OPERATOR_OPTIONS}" || status=$?
+  # On the target system, every script/test case exits with code 1 on error.
+  # Reboot-related disconnections break the SSH connection and can abort the
+  # whole job with other non-zero codes; a reboot never yields exit code 1,
+  # so only exit code 1 is treated as a real e2e failure.
+  if [ $status -eq 1 ]; then
+    echo "e2e validation failed"
+    exit 1
+  fi
+  sleep 30
+  ${SCRIPT_DIR}/remote_retry.sh
+done
diff --git a/tests/scripts/.definitions.sh b/tests/scripts/.definitions.sh
index f254bc00..0b4e2d7e 100644
--- a/tests/scripts/.definitions.sh
+++ b/tests/scripts/.definitions.sh
@@ -14,8 +14,6 @@ CASES_DIR="$( cd "${TEST_DIR}/cases" && pwd )"
 
 : ${TEST_NAMESPACE:="test-operator"}
 
-: ${PRIVATE_REGISTRY:="ghcr.io"}
-
 : ${HELM_NVIDIA_REPO:="https://helm.ngc.nvidia.com/nvidia"}
 
 : ${TARGET_DRIVER_VERSION:="550.90.07"}
@@ -24,3 +22,9 @@ CASES_DIR="$( cd "${TEST_DIR}/cases" && pwd )"
 : ${POD_STATUS_TIME_OUT:="2m"}
 
 : ${LOG_DIR:="/tmp/logs"}
+
+: ${OPERATOR_OPTIONS:="--set driver.repository=ghcr.io/nvidia"}
+: ${SYSTEM_ONLINE_CHECK_TIMEOUT:="900"}
+
+: ${BASE_TARGET:="jammy"}
+: ${GRUB_FILE:="/boot/grub/grub.cfg"}
diff --git a/tests/scripts/.local.sh b/tests/scripts/.local.sh
index 7971a404..f3d98b2f 100644
--- a/tests/scripts/.local.sh
+++ b/tests/scripts/.local.sh
@@ -3,3 +3,7 @@
 function remote() {
     ${SCRIPT_DIR}/remote.sh "cd ${PROJECT} && "$@""
 }
+
+function remote_retry() {
+    ${SCRIPT_DIR}/remote_retry.sh
+}
diff --git a/tests/scripts/.rsync-excludes b/tests/scripts/.rsync-excludes
deleted file mode 100644
index 06c2f6ef..00000000
--- a/tests/scripts/.rsync-excludes
+++ /dev/null
@@ -1,4 +0,0 @@
-vendor/
-.git
-cnt-ci
-key.pem
diff --git a/tests/scripts/.rsync-includes b/tests/scripts/.rsync-includes
new file mode 100644
index 00000000..f91de959
--- /dev/null
+++ b/tests/scripts/.rsync-includes
@@ -0,0 +1,2 @@
+tests/
+tests/***
diff --git a/tests/scripts/end-to-end-nvidia-driver.sh b/tests/scripts/end-to-end-nvidia-driver.sh
index d272efab..ab2db9a1 100755
--- a/tests/scripts/end-to-end-nvidia-driver.sh
+++ b/tests/scripts/end-to-end-nvidia-driver.sh
@@ -11,7 +11,7 @@
 ${SCRIPT_DIR}/install-operator.sh
 
 "${SCRIPT_DIR}"/verify-operator.sh
 
-echo "--------------Verification completed for GPU Operator, uninstalling the operator--------------"
+echo "--------------Verification completed for GPU Operator, uninstalling the GPU operator--------------"
 
 ${SCRIPT_DIR}/uninstall-operator.sh ${TEST_NAMESPACE} "gpu-operator"
diff --git a/tests/scripts/findkernelversion.sh b/tests/scripts/findkernelversion.sh
new file mode 100755
index 00000000..4ae834b7
--- /dev/null
+++ b/tests/scripts/findkernelversion.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+if [[ "${SKIP_INSTALL}" == "true" ]]; then
+  echo "Skipping install: SKIP_INSTALL=${SKIP_INSTALL}"
+  exit 0
+fi
+
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+source "${SCRIPT_DIR}"/.definitions.sh
+
+export REGCTL_VERSION=v0.4.7
+mkdir -p bin
+curl -sSLo bin/regctl https://github.com/regclient/regclient/releases/download/${REGCTL_VERSION}/regctl-linux-amd64
+chmod a+x bin/regctl
+export PATH=$(pwd)/bin:${PATH}
+DRIVER_BRANCH=$(echo "${TARGET_DRIVER_VERSION}" | cut -d '.' -f 1)
+KERNEL_FLAVOR=$(uname -r | awk -F'-' '{print $3}')
+regctl image get-file ghcr.io/nvidia/driver:base-${BASE_TARGET}-${KERNEL_FLAVOR}-${DRIVER_BRANCH} /var/kernel_version.txt ${LOG_DIR}/kernel_version.txt || true
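findkernelversion.sh builds the precompiled base image tag from two derived pieces: the kernel flavor, taken from the third dash-separated field of `uname -r`, and the driver branch. A short sketch with a representative AWS kernel string (the sample values are illustrative):

```bash
#!/bin/bash
# Sample release string standing in for the live `uname -r` output.
kernel_release="5.15.0-1064-aws"
KERNEL_FLAVOR=$(echo "${kernel_release}" | awk -F'-' '{print $3}')   # -> "aws"
DRIVER_BRANCH=$(echo "550.90.07" | cut -d '.' -f 1)                  # -> "550"
BASE_TARGET="jammy"
# Tag layout used by the regctl fetch above:
echo "ghcr.io/nvidia/driver:base-${BASE_TARGET}-${KERNEL_FLAVOR}-${DRIVER_BRANCH}"
```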
diff --git a/tests/scripts/install-operator.sh b/tests/scripts/install-operator.sh
index 3acfcffb..b6785512 100755
--- a/tests/scripts/install-operator.sh
+++ b/tests/scripts/install-operator.sh
@@ -5,10 +5,14 @@ if [[ "${SKIP_INSTALL}" == "true" ]]; then
   exit 0
 fi
 
+echo "Checking current kernel version..."
+CURRENT_KERNEL=$(uname -r)
+echo "Current kernel version: $CURRENT_KERNEL"
+
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 source ${SCRIPT_DIR}/.definitions.sh
 
-OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set driver.repository=${PRIVATE_REGISTRY}/nvidia --set driver.version=${TARGET_DRIVER_VERSION}"
+OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set driver.version=${TARGET_DRIVER_VERSION}"
 
 # add helm driver repo
 helm repo add nvidia ${HELM_NVIDIA_REPO} && helm repo update
@@ -17,8 +21,8 @@ helm repo add nvidia ${HELM_NVIDIA_REPO} && helm repo update
 kubectl create namespace "${TEST_NAMESPACE}"
 
 # Run the helm install command
-echo "OPERATOR_OPTIONS: $OPERATOR_OPTIONS"
-${HELM} install gpu-operator nvidia/gpu-operator \
+echo "OPERATOR_OPTIONS: ${OPERATOR_OPTIONS}"
+eval ${HELM} install gpu-operator nvidia/gpu-operator \
     -n "${TEST_NAMESPACE}" \
-    ${OPERATOR_OPTIONS} \
+    "${OPERATOR_OPTIONS}" \
     --wait
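install-operator.sh now receives OPERATOR_OPTIONS as one quoted string, so the helm command is wrapped in eval to split that string back into individual --set flags. A condensed illustration of the difference, with echo standing in for the helm binary:

```bash
#!/bin/bash
OPERATOR_OPTIONS="--set driver.repository=ghcr.io/nvidia --set driver.version=550.90.07"

# Without eval the whole string arrives as a single (invalid) argument:
printf 'one argument: <%s>\n' "${OPERATOR_OPTIONS}"

# With eval the string is re-parsed into separate flags before the call:
eval echo 'separate arguments:' "${OPERATOR_OPTIONS}"
```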
diff --git a/tests/scripts/kernel-upgrade-helper.sh b/tests/scripts/kernel-upgrade-helper.sh
new file mode 100755
index 00000000..4c49057b
--- /dev/null
+++ b/tests/scripts/kernel-upgrade-helper.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+
+if [[ "${SKIP_INSTALL}" == "true" ]]; then
+  echo "Skipping install: SKIP_INSTALL=${SKIP_INSTALL}"
+  exit 0
+fi
+
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+source "${SCRIPT_DIR}"/.definitions.sh
+
+# find the kernel version shipped in the precompiled driver base image
+${SCRIPT_DIR}/findkernelversion.sh
+source "${LOG_DIR}"/kernel_version.txt
+
+echo "Checking current kernel version..."
+CURRENT_KERNEL=$(uname -r)
+echo "Current kernel version: $CURRENT_KERNEL"
+
+if [ "${CURRENT_KERNEL}" != "${KERNEL_VERSION}" ]; then
+  echo ""
+  echo ""
+  echo "--------------Upgrading kernel to ${KERNEL_VERSION}--------------"
+
+  sudo apt-get update -y
+  sudo apt-get install --allow-downgrades linux-image-${KERNEL_VERSION} -y
+  if [ $? -ne 0 ]; then
+    echo "Kernel upgrade failed."
+    exit 1
+  fi
+
+  echo "update grub..."
+  # submenu menuentry kernel index calculation:
+  # count starts at -2 because:
+  #   the "submenu 'Advanced options for ...'" line is itself included in the count
+  #   and the menu index is 0-based, not 1-based
+  kernel_index=$(awk -v kernel_version="$KERNEL_VERSION" '
+    BEGIN {
+        count = -2
+        submenu_depth = 0
+        match_found = 0
+    }
+
+    /^submenu / {
+        submenu_depth++
+    }
+
+    # Match menuentry lines and check for the kernel version
+    /menuentry/ && submenu_depth > 0 && match_found == 0 {
+        count++
+        #print "Found match at Index " count ": " $0
+        if ($0 ~ kernel_version) {
+            print count
+            match_found = 1
+        }
+    }
+
+    /^}/ {
+        if (submenu_depth > 0) {
+            submenu_depth--
+        }
+    }
+  ' "$GRUB_FILE")
+  sudo sed -i "s/^GRUB_DEFAULT=.*/GRUB_DEFAULT=\"1>${kernel_index}\"/" /etc/default/grub
+  sudo update-grub
+
+  echo "Rebooting ..."
+  # Run the reboot command with nohup to avoid abrupt SSH closure issues
+  nohup sudo reboot &
+
+  echo "--------------Kernel upgrade completed--------------"
+else
+  echo "--------------Kernel upgrade not required, current kernel version ${KERNEL_VERSION}--------------"
+fi
+
+# Exit with a success code since the reboot command was issued successfully
+exit 0
diff --git a/tests/scripts/remote_retry.sh b/tests/scripts/remote_retry.sh
new file mode 100755
index 00000000..42a94443
--- /dev/null
+++ b/tests/scripts/remote_retry.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+source ${SCRIPT_DIR}/.definitions.sh
+source ${SCRIPT_DIR}/.local.sh
+
+try_ssh_connection() {
+    status=0
+    ssh -o ConnectTimeout=20 -i ${private_key} ${instance_hostname} "exit" || status=$?
+    return $status
+}
+
+echo "Waiting for the AWS system to come back online..."
+START_TIME=$(date +%s)
+while true; do
+    try_ssh_connection
+    if [ $? -eq 0 ]; then
+        echo "Successfully connected to the AWS system after reboot."
+        break
+    fi
+    ELAPSED_TIME=$(($(date +%s) - START_TIME))
+    if [ "$ELAPSED_TIME" -ge "$SYSTEM_ONLINE_CHECK_TIMEOUT" ]; then
+        echo "Failed to connect to the AWS system within ${SYSTEM_ONLINE_CHECK_TIMEOUT} seconds after reboot."
+        exit 1
+    fi
+    sleep 30
+    echo "retrying ssh connection..."
+done
diff --git a/tests/scripts/sync.sh b/tests/scripts/sync.sh
index cb020752..555d7b86 100755
--- a/tests/scripts/sync.sh
+++ b/tests/scripts/sync.sh
@@ -12,6 +12,7 @@ source ${SCRIPT_DIR}/.local.sh
 
 rsync -e "ssh -i ${private_key} -o StrictHostKeyChecking=no" \
     -avz --delete \
-    --exclude-from="${SCRIPT_DIR}/.rsync-excludes" \
+    --include-from="${SCRIPT_DIR}/.rsync-includes" \
+    --exclude='*' \
     ${@}
 
diff --git a/tests/scripts/upgrade-kernel.sh b/tests/scripts/upgrade-kernel.sh
new file mode 100755
index 00000000..e7d90ec3
--- /dev/null
+++ b/tests/scripts/upgrade-kernel.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+# This test case upgrades the kernel on the test system.
+
+SCRIPTS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/../scripts && pwd )"
+source "${SCRIPTS_DIR}"/.definitions.sh
+
+# Run the kernel upgrade on the remote system
+"${SCRIPTS_DIR}"/kernel-upgrade-helper.sh
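The GRUB index logic in kernel-upgrade-helper.sh can be sanity-checked offline. The sketch below runs a condensed version of the same awk program against a minimal fake grub.cfg; the entries are illustrative, but they keep the `$menuentry_id_option` token that real Ubuntu configs carry, which is what makes the submenu line itself match /menuentry/ and is why the count starts at -2:

```bash
#!/bin/bash
# A minimal fake grub.cfg; illustrative entries modeled on Ubuntu's layout.
cat > /tmp/fake-grub.cfg <<'EOF'
menuentry 'Ubuntu' $menuentry_id_option 'gnulinux-simple' { }
submenu 'Advanced options for Ubuntu' $menuentry_id_option 'gnulinux-advanced' {
	menuentry 'Ubuntu, with Linux 6.5.0-35-generic' $menuentry_id_option 'gnulinux-6.5.0-35' { }
	menuentry 'Ubuntu, with Linux 6.5.0-35-generic (recovery mode)' $menuentry_id_option 'gnulinux-6.5.0-35-rec' { }
	menuentry 'Ubuntu, with Linux 5.15.0-1064-aws' $menuentry_id_option 'gnulinux-aws' { }
}
EOF

KERNEL_VERSION="5.15.0-1064-aws"
# Condensed form of the helper's awk program, pointed at the fake file.
awk -v kernel_version="$KERNEL_VERSION" '
  BEGIN { count = -2; submenu_depth = 0; match_found = 0 }
  /^submenu / { submenu_depth++ }
  /menuentry/ && submenu_depth > 0 && match_found == 0 {
      count++
      if ($0 ~ kernel_version) { print count; match_found = 1 }
  }
  /^}/ { if (submenu_depth > 0) submenu_depth-- }
' /tmp/fake-grub.cfg    # prints 2, i.e. GRUB_DEFAULT="1>2"
```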