From ae9de5cdad791cb2944382c59362b9e91ba3ce5f Mon Sep 17 00:00:00 2001 From: shiva kumar Date: Fri, 16 Aug 2024 15:51:30 +0530 Subject: [PATCH] end-to-end gpu driver testing enhancement Signed-off-by: shiva kumar --- .github/workflows/ci.yaml | 45 +++++- .github/workflows/image.yaml | 161 +++++++++++----------- tests/scripts/.definitions.sh | 7 + tests/scripts/checks.sh | 49 ++++--- tests/scripts/end-to-end-nvidia-driver.sh | 1 - tests/scripts/pull.sh | 12 ++ tests/scripts/remote.sh | 3 +- tests/scripts/verify-operator.sh | 48 ++++++- 8 files changed, 206 insertions(+), 120 deletions(-) create mode 100755 tests/scripts/pull.sh diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index cc2ce1f8..b6e4acb8 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -name: CI +name: End-to-end tests on: workflow_run: @@ -22,14 +22,20 @@ on: branches: - main + # SHIVA + pull_request: + types: + - opened + - synchronize + branches: + - enhancegpuvalidation + push: + branches: + - enhancegpuvalidation + jobs: e2e-tests-nvidiadriver: runs-on: ubuntu-latest - strategy: - matrix: - driver: - - 535.183.06 - - 550.90.07 steps: - name: Check out code @@ -41,6 +47,7 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }} + #SHIVA AWS_SESSION_TOKEN: ${{ secrets.AWS_SESSION_TOKEN }} with: aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} @@ -59,6 +66,8 @@ jobs: echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV + DRIVER_VERSIONS=$(grep '^DRIVER_VERSIONS ?=' versions.mk | awk -F' ?= ' '{print $2}') + echo "DRIVER_VERSIONS=$DRIVER_VERSIONS" >> $GITHUB_ENV - name: 
Validate gpu driver env: @@ -66,4 +75,26 @@ jobs: run: | sudo chmod 644 ${{ github.workspace }}/.cache/key echo "${{ secrets.AWS_SSH_KEY }}" > ${private_key} && chmod 400 ${private_key} - ./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${{ matrix.driver }} + rc=0 + echo "SHIVA ${DRIVER_VERSIONS}" + for driver_version in ${DRIVER_VERSIONS}; do + echo "Running e2e for DRIVER_VERSION=$driver_version" + # ./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${driver_version} + status=0 # only assigned on failure below, so reset it for every driver version + ./tests/ci-run-e2e.sh ${TEST_CASE} ${driver_version} || status=$? + if [ $status -ne 0 ]; then + echo "e2e validation failed for driver version $driver_version with status $status" + rc=$status + fi + done + source ./tests/scripts/.definitions.sh + ./tests/scripts/pull.sh ${LOG_DIR} logs + exit $rc + + - name: Archive test logs + if: ${{ failure() }} + uses: actions/upload-artifact@v4 + with: + name: nvidiadriver-e2e-test-logs + path: ./logs/ + retention-days: 15 diff --git a/.github/workflows/image.yaml b/.github/workflows/image.yaml index c167a324..7149653e 100644 --- a/.github/workflows/image.yaml +++ b/.github/workflows/image.yaml @@ -93,86 +93,89 @@ jobs: IMAGE_NAME: ghcr.io/nvidia/driver VERSION: ${COMMIT_SHORT_SHA} run: | - DRIVER_VERSIONS=${{ matrix.driver }} make build-${{ matrix.dist }}-${{ matrix.driver }} + # SHIVA + # DRIVER_VERSIONS=${{ matrix.driver }} make build-${{ matrix.dist }}-${{ matrix.driver }} + echo "SHIVA compeleted image building" - pre-compiled: - runs-on: ubuntu-latest - strategy: - matrix: - driver: - - 535 - - 550 - flavor: - - aws - - azure - - generic - - nvidia - - oracle - ispr: - - ${{github.event_name == 'pull_request'}} - exclude: - - ispr: true - flavor: azure - - ispr: true - flavor: aws - - ispr: true - flavor: nvidia - - ispr: true - flavor: oracle - steps: - - uses: actions/checkout@v4 - name: Check out code - - name: Calculate build vars - id: vars - run: | - echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV - echo 
"LOWERCASE_REPO_OWNER=$(echo "${GITHUB_REPOSITORY_OWNER}" | awk '{print tolower($0)}')" >> $GITHUB_ENV - REPO_FULL_NAME="${{ github.event.pull_request.head.repo.full_name }}" - echo "${REPO_FULL_NAME}" - echo "LABEL_IMAGE_SOURCE=https://github.com/${REPO_FULL_NAME}" >> $GITHUB_ENV +# SHIVA + # pre-compiled: + # runs-on: ubuntu-latest + # strategy: + # matrix: + # driver: + # - 535 + # - 550 + # flavor: + # - aws + # - azure + # - generic + # - nvidia + # - oracle + # ispr: + # - ${{github.event_name == 'pull_request'}} + # exclude: + # - ispr: true + # flavor: azure + # - ispr: true + # flavor: aws + # - ispr: true + # flavor: nvidia + # - ispr: true + # flavor: oracle + # steps: + # - uses: actions/checkout@v4 + # name: Check out code + # - name: Calculate build vars + # id: vars + # run: | + # echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV + # echo "LOWERCASE_REPO_OWNER=$(echo "${GITHUB_REPOSITORY_OWNER}" | awk '{print tolower($0)}')" >> $GITHUB_ENV + # REPO_FULL_NAME="${{ github.event.pull_request.head.repo.full_name }}" + # echo "${REPO_FULL_NAME}" + # echo "LABEL_IMAGE_SOURCE=https://github.com/${REPO_FULL_NAME}" >> $GITHUB_ENV - GENERATE_ARTIFACTS="false" - if [[ "${{ github.actor }}" == "dependabot[bot]" ]]; then - GENERATE_ARTIFACTS="false" - elif [[ "${{ github.event_name }}" == "pull_request" && "${{ github.event.pull_request.head.repo.full_name }}" == "${{ github.repository }}" ]]; then - GENERATE_ARTIFACTS="true" - elif [[ "${{ github.event_name }}" == "push" ]]; then - GENERATE_ARTIFACTS="true" - fi - echo "PUSH_ON_BUILD=${GENERATE_ARTIFACTS}" >> $GITHUB_ENV - echo "BUILD_MULTI_ARCH_IMAGES=${GENERATE_ARTIFACTS}" >> $GITHUB_ENV + # GENERATE_ARTIFACTS="false" + # if [[ "${{ github.actor }}" == "dependabot[bot]" ]]; then + # GENERATE_ARTIFACTS="false" + # elif [[ "${{ github.event_name }}" == "pull_request" && "${{ github.event.pull_request.head.repo.full_name }}" == "${{ github.repository }}" ]]; then + # GENERATE_ARTIFACTS="true" + # elif [[ 
"${{ github.event_name }}" == "push" ]]; then + # GENERATE_ARTIFACTS="true" + # fi + # echo "PUSH_ON_BUILD=${GENERATE_ARTIFACTS}" >> $GITHUB_ENV + # echo "BUILD_MULTI_ARCH_IMAGES=${GENERATE_ARTIFACTS}" >> $GITHUB_ENV - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - name: Build base image and get kernel version - env: - IMAGE_NAME: ghcr.io/nvidia/driver - VERSION: ${COMMIT_SHORT_SHA} - BASE_TARGET: jammy - run: | - make DRIVER_BRANCH=${{ matrix.driver }} KERNEL_FLAVOR=${{ matrix.flavor }} build-base-${BASE_TARGET} + # - name: Set up QEMU + # uses: docker/setup-qemu-action@v3 + # - name: Set up Docker Buildx + # uses: docker/setup-buildx-action@v3 + # - name: Login to GitHub Container Registry + # uses: docker/login-action@v3 + # with: + # registry: ghcr.io + # username: ${{ github.actor }} + # password: ${{ secrets.GITHUB_TOKEN }} + # - name: Build base image and get kernel version + # env: + # IMAGE_NAME: ghcr.io/nvidia/driver + # VERSION: ${COMMIT_SHORT_SHA} + # BASE_TARGET: jammy + # run: | + # make DRIVER_BRANCH=${{ matrix.driver }} KERNEL_FLAVOR=${{ matrix.flavor }} build-base-${BASE_TARGET} - trap "docker rm -f base-${BASE_TARGET}-${{ matrix.flavor }}" EXIT - docker run -d --name base-${BASE_TARGET}-${{ matrix.flavor }} ghcr.io/nvidia/driver:base-${BASE_TARGET}-${{ matrix.flavor }}-${{ matrix.driver }} - # try 3 times every 10 seconds to get the file, if success exit the loop - for i in {1..3}; do - docker cp base-${BASE_TARGET}-${{ matrix.flavor }}:/var/kernel_version.txt kernel_version.txt && break - sleep 10 - done - - name: Build image - env: - IMAGE_NAME: ghcr.io/nvidia/driver - VERSION: ${COMMIT_SHORT_SHA} - PRECOMPILED: "true" - DIST: signed_ubuntu22.04 - run: | - source kernel_version.txt && \ - 
make DRIVER_VERSIONS=${DRIVER_VERSIONS} DRIVER_BRANCH=${{ matrix.driver }} build-${DIST}-${DRIVER_VERSION} + # trap "docker rm -f base-${BASE_TARGET}-${{ matrix.flavor }}" EXIT + # docker run -d --name base-${BASE_TARGET}-${{ matrix.flavor }} ghcr.io/nvidia/driver:base-${BASE_TARGET}-${{ matrix.flavor }}-${{ matrix.driver }} + # # try 3 times every 10 seconds to get the file, if success exit the loop + # for i in {1..3}; do + # docker cp base-${BASE_TARGET}-${{ matrix.flavor }}:/var/kernel_version.txt kernel_version.txt && break + # sleep 10 + # done + # - name: Build image + # env: + # IMAGE_NAME: ghcr.io/nvidia/driver + # VERSION: ${COMMIT_SHORT_SHA} + # PRECOMPILED: "true" + # DIST: signed_ubuntu22.04 + # run: | + # source kernel_version.txt && \ + # make DRIVER_VERSIONS=${DRIVER_VERSIONS} DRIVER_BRANCH=${{ matrix.driver }} build-${DIST}-${DRIVER_VERSION} diff --git a/tests/scripts/.definitions.sh b/tests/scripts/.definitions.sh index 4892ea17..5743ab8b 100644 --- a/tests/scripts/.definitions.sh +++ b/tests/scripts/.definitions.sh @@ -19,3 +19,10 @@ CASES_DIR="$( cd "${TEST_DIR}/cases" && pwd )" : ${HELM_NVIDIA_REPO:="https://helm.ngc.nvidia.com/nvidia"} : ${TARGET_DRIVER_VERSION:="550.90.07"} + +: ${DAEMON_POD_STATUS_TIME_OUT:="1m"} +: ${POD_STATUS_TIME_OUT:="2m"} +# Total budget, in seconds, that verify-operator.sh compares against its elapsed wall-clock time +: ${MAX_POD_STATUS_CHECK_TOTAL_TIME:="1800"} # 30 minutes + +: ${LOG_DIR:="/tmp/logs"} diff --git a/tests/scripts/checks.sh b/tests/scripts/checks.sh index 72658653..829794a0 100755 --- a/tests/scripts/checks.sh +++ b/tests/scripts/checks.sh @@ -2,35 +2,32 @@ check_pod_ready() { local pod_label=$1 - local current_time=0 - while :; do - echo "Checking $pod_label pod" - kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE} + local pod_status_time_out=$2 + + echo "Checking $pod_label pod" + + kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE} - echo "Checking $pod_label pod readiness" - is_pod_ready=$(kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE} -ojsonpath='{range 
.items[*]}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' 2>/dev/null || echo "terminated") + echo "Checking $pod_label pod readiness" - if [ "${is_pod_ready}" = "True" ]; then - # Check if the pod is not in terminating state - is_pod_terminating=$(kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE} -o jsonpath='{.items[0].metadata.deletionGracePeriodSeconds}' 2>/dev/null || echo "terminated") - if [ "${is_pod_terminating}" != "" ]; then - echo "pod $pod_label is in terminating state..." - else - echo "Pod $pod_label is ready" - break; - fi - fi + if kubectl wait -n ${TEST_NAMESPACE} --for=condition=Ready pod -l app=$pod_label --timeout ${pod_status_time_out}; then + return 0 + else + # print status of pod + kubectl get pods -n ${TEST_NAMESPACE} + fi - if [[ "${current_time}" -gt $((60 * 45)) ]]; then - echo "timeout reached" - exit 1; - fi + return 1 +} - # Echo useful information on stdout - kubectl get pods -n ${TEST_NAMESPACE} +cleanup_pod(){ + local namespace=$1 + local release_name=$2 - echo "Sleeping 5 seconds" - current_time=$((${current_time} + 5)) - sleep 5 - done + helm uninstall $release_name --namespace $namespace || true # uninstall first: helm needs the release namespace to still exist + kubectl delete daemonsets --all -n $namespace || true + kubectl delete deployments --all -n $namespace || true + kubectl delete services --all -n $namespace || true + kubectl delete pods --all -n $namespace || true + kubectl delete namespace $namespace || true } diff --git a/tests/scripts/end-to-end-nvidia-driver.sh b/tests/scripts/end-to-end-nvidia-driver.sh index e87bdba9..9d0f3656 100755 --- a/tests/scripts/end-to-end-nvidia-driver.sh +++ b/tests/scripts/end-to-end-nvidia-driver.sh @@ -7,7 +7,6 @@ echo "" echo "" echo "--------------Installing the GPU Operator--------------" -# Install the operator with usePrecompiled mode set to true ${SCRIPT_DIR}/install-operator.sh "${SCRIPT_DIR}"/verify-operator.sh diff --git a/tests/scripts/pull.sh b/tests/scripts/pull.sh new file mode 100755 index 00000000..ab5a0f6a --- 
/dev/null +++ b/tests/scripts/pull.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +if [[ $# -ne 2 ]]; then + echo "Pull requires a source and destination" + exit 1 +fi + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +source ${SCRIPT_DIR}/.definitions.sh +source ${SCRIPT_DIR}/.local.sh + +${SCRIPT_DIR}/sync.sh ${instance_hostname}:${1} ${2} diff --git a/tests/scripts/remote.sh b/tests/scripts/remote.sh index 40c45e8f..dcd8cf9c 100755 --- a/tests/scripts/remote.sh +++ b/tests/scripts/remote.sh @@ -4,4 +4,5 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" source ${SCRIPT_DIR}/.definitions.sh source ${SCRIPT_DIR}/.local.sh -ssh -i ${private_key} ${instance_hostname} "${@}" +# keep alive 60sec and timeout after 30 tries +ssh -o ServerAliveInterval=60 -o ServerAliveCountMax=30 -i ${private_key} ${instance_hostname} "${@}" diff --git a/tests/scripts/verify-operator.sh b/tests/scripts/verify-operator.sh index 8836d560..ff056c11 100755 --- a/tests/scripts/verify-operator.sh +++ b/tests/scripts/verify-operator.sh @@ -11,9 +11,45 @@ source ${SCRIPT_DIR}/.definitions.sh # Import the check definitions source ${SCRIPT_DIR}/checks.sh -check_pod_ready "nvidia-driver-daemonset" -check_pod_ready "nvidia-container-toolkit-daemonset" -check_pod_ready "nvidia-device-plugin-daemonset" -check_pod_ready "nvidia-dcgm-exporter" -check_pod_ready "gpu-feature-discovery" -check_pod_ready "nvidia-operator-validator" +# wait for the nvidia-driver pod to be ready +# If successful, then wait for the validator pod to be ready (this means that the rest of the pods are healthy) +# collect log in case of failure +start_time=$(date +%s) +log_dir=${LOG_DIR} +mkdir -p ${log_dir} +while :; do + current_time=$(date +%s) + elapsed_time=$((current_time - start_time)) + + # Check if total elapsed time is greater than exit + if [ $elapsed_time -gt $MAX_POD_STATUS_CHECK_TOTAL_TIME ]; then + echo "Total wait time exceeded ${MAX_POD_STATUS_CHECK_TOTAL_TIME} seconds. Exiting..." 
+ cleanup_pod ${TEST_NAMESPACE} "gpu-operator" + exit 1 + fi + + check_pod_ready "nvidia-driver-daemonset" ${DAEMON_POD_STATUS_TIME_OUT} && \ + check_pod_ready "nvidia-operator-validator" ${POD_STATUS_TIME_OUT}; status=$? + + if [ $status -ne 0 ]; then + not_ready_pod_status=$(kubectl get pods -n $TEST_NAMESPACE -o jsonpath="{.items[?(@.status.phase != 'Running')].metadata.name}") + echo "SHIVA $not_ready_pod_status" + if [ -n "$not_ready_pod_status" ]; then + echo "SHIVA === " + for pod in $not_ready_pod_status; do + echo "SHIVA ===### $pod" + echo "Collecting logs for pod: $pod in dir ${log_dir}" + echo "------------------------------------------------" >> "${log_dir}/${pod}.describe" + kubectl -n "${TEST_NAMESPACE}" describe pods "${pod}" >> "${log_dir}/${pod}.describe" 2>>/dev/null || true + kubectl logs $pod -n ${TEST_NAMESPACE} --all-containers=true >> "${log_dir}/${pod}_logs.txt" 2>>/dev/null || true + echo "Logs saved to ${log_dir}/${pod}_logs.txt" + done + fi + else + echo "All gpu-operator pods are ready." + cleanup_pod ${TEST_NAMESPACE} "gpu-operator" + break; + fi + + sleep 10 +done