From c6f8865db86e9c7c1131ca9b51171146d408cb71 Mon Sep 17 00:00:00 2001 From: shiva kumar Date: Fri, 16 Aug 2024 15:51:30 +0530 Subject: [PATCH] end-to-end gpu driver testing enhancement Signed-off-by: shiva kumar --- .github/workflows/ci.yaml | 31 +++++++++++++----- tests/scripts/.definitions.sh | 5 +++ tests/scripts/checks.sh | 39 +++++++---------------- tests/scripts/end-to-end-nvidia-driver.sh | 6 +++- tests/scripts/pull.sh | 12 +++++++ tests/scripts/remote.sh | 3 +- tests/scripts/uninstall-operator.sh | 14 ++++++++ tests/scripts/verify-operator.sh | 20 ++++++++---- 8 files changed, 87 insertions(+), 43 deletions(-) create mode 100755 tests/scripts/pull.sh create mode 100755 tests/scripts/uninstall-operator.sh diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index cc2ce1f8..e44be0ce 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -name: CI +name: End-to-end tests on: workflow_run: @@ -25,11 +25,6 @@ on: jobs: e2e-tests-nvidiadriver: runs-on: ubuntu-latest - strategy: - matrix: - driver: - - 535.183.06 - - 550.90.07 steps: - name: Check out code @@ -41,7 +36,6 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }} - AWS_SESSION_TOKEN: ${{ secrets.AWS_SESSION_TOKEN }} with: aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} @@ -59,6 +53,8 @@ jobs: echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV + DRIVER_VERSIONS=$(grep '^DRIVER_VERSIONS ?=' versions.mk | awk -F' ?= ' '{print $2}') + echo "DRIVER_VERSIONS=$DRIVER_VERSIONS" >> $GITHUB_ENV - name: Validate gpu driver env: @@ -66,4 +62,23 @@ jobs: run: | sudo chmod 644 ${{ github.workspace }}/.cache/key echo "${{ secrets.AWS_SSH_KEY }}" > ${private_key} && chmod 400 ${private_key} - ./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${{ matrix.driver }} + rc=0 + for driver_version in ${DRIVER_VERSIONS}; do + echo "Running e2e for DRIVER_VERSION=$driver_version" + ./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${driver_version} || status=$? + if [ $status -ne 0 ]; then + echo "e2e validation failed for driver version $driver_version with status $status" + rc=$status + fi + done + source ./tests/scripts/.definitions.sh + ./tests/scripts/pull.sh ${LOG_DIR} logs + exit $rc + + - name: Archive test logs + if: ${{ failure() }} + uses: actions/upload-artifact@v4 + with: + name: nvidiadriver-e2e-test-logs + path: ./logs/ + retention-days: 15 diff --git a/tests/scripts/.definitions.sh b/tests/scripts/.definitions.sh index 4892ea17..f254bc00 100644 --- a/tests/scripts/.definitions.sh +++ b/tests/scripts/.definitions.sh @@ -19,3 +19,8 @@ CASES_DIR="$( cd "${TEST_DIR}/cases" && pwd )" : ${HELM_NVIDIA_REPO:="https://helm.ngc.nvidia.com/nvidia"} : ${TARGET_DRIVER_VERSION:="550.90.07"} + +: ${DAEMON_POD_STATUS_TIME_OUT:="15m"} +: ${POD_STATUS_TIME_OUT:="2m"} + +: ${LOG_DIR:="/tmp/logs"} diff --git a/tests/scripts/checks.sh b/tests/scripts/checks.sh index 72658653..c30b2d4f 100755 --- a/tests/scripts/checks.sh +++ b/tests/scripts/checks.sh @@ -2,35 +2,20 @@ check_pod_ready() { local pod_label=$1 - local current_time=0 - while :; do - echo "Checking $pod_label pod" - kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE} + local pod_status_time_out=$2 + + echo "Checking $pod_label pod" + + kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE} - echo "Checking $pod_label pod readiness" - is_pod_ready=$(kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE} -ojsonpath='{range .items[*]}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' 2>/dev/null || echo "terminated") + echo "Checking $pod_label pod readiness" - if [ "${is_pod_ready}" = "True" ]; then - # Check if the pod is not in terminating state - is_pod_terminating=$(kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE} -o jsonpath='{.items[0].metadata.deletionGracePeriodSeconds}' 2>/dev/null || echo "terminated") - if [ "${is_pod_terminating}" != "" ]; then - echo "pod $pod_label is in terminating state..." - else - echo "Pod $pod_label is ready" - break; - fi - fi - - if [[ "${current_time}" -gt $((60 * 45)) ]]; then - echo "timeout reached" - exit 1; - fi - - # Echo useful information on stdout + if kubectl wait -n ${TEST_NAMESPACE} --for=condition=Ready pod -l app=$pod_label --timeout ${pod_status_time_out}; then + return 0 + else + # print status of pod kubectl get pods -n ${TEST_NAMESPACE} + fi - echo "Sleeping 5 seconds" - current_time=$((${current_time} + 5)) - sleep 5 - done + return 1 } diff --git a/tests/scripts/end-to-end-nvidia-driver.sh b/tests/scripts/end-to-end-nvidia-driver.sh index e87bdba9..d272efab 100755 --- a/tests/scripts/end-to-end-nvidia-driver.sh +++ b/tests/scripts/end-to-end-nvidia-driver.sh @@ -7,8 +7,12 @@ echo "" echo "" echo "--------------Installing the GPU Operator--------------" -# Install the operator with usePrecompiled mode set to true ${SCRIPT_DIR}/install-operator.sh "${SCRIPT_DIR}"/verify-operator.sh + +echo "--------------Verification completed for GPU Operator, uninstalling the operator--------------" + +${SCRIPT_DIR}/uninstall-operator.sh ${TEST_NAMESPACE} "gpu-operator" + echo "--------------Verification completed for GPU Operator--------------" diff --git a/tests/scripts/pull.sh b/tests/scripts/pull.sh new file mode 100755 index 00000000..ab5a0f6a --- /dev/null +++ b/tests/scripts/pull.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +if [[ $# -ne 2 ]]; then + echo "Pull requires a source and destination" + exit 1 +fi + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +source ${SCRIPT_DIR}/.definitions.sh +source ${SCRIPT_DIR}/.local.sh + +${SCRIPT_DIR}/sync.sh ${instance_hostname}:${1} ${2} diff --git a/tests/scripts/remote.sh b/tests/scripts/remote.sh index 40c45e8f..dcd8cf9c 100755 --- a/tests/scripts/remote.sh +++ b/tests/scripts/remote.sh @@ -4,4 +4,5 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" source ${SCRIPT_DIR}/.definitions.sh source ${SCRIPT_DIR}/.local.sh -ssh -i ${private_key} ${instance_hostname} "${@}" +# keep alive 60sec and timeout after 30 tries +ssh -o ServerAliveInterval=60 -o ServerAliveCountMax=30 -i ${private_key} ${instance_hostname} "${@}" diff --git a/tests/scripts/uninstall-operator.sh b/tests/scripts/uninstall-operator.sh new file mode 100755 index 00000000..8c45132a --- /dev/null +++ b/tests/scripts/uninstall-operator.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +if [[ "${SKIP_INSTALL}" == "true" ]]; then + echo "Skipping install: SKIP_INSTALL=${SKIP_INSTALL}" + exit 0 +fi + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +source ${SCRIPT_DIR}/.definitions.sh + +namespace=$1 +release_name=$2 +helm uninstall $release_name --namespace $namespace || true +kubectl delete namespace $namespace || true diff --git a/tests/scripts/verify-operator.sh b/tests/scripts/verify-operator.sh index 8836d560..d663db37 100755 --- a/tests/scripts/verify-operator.sh +++ b/tests/scripts/verify-operator.sh @@ -11,9 +11,17 @@ source ${SCRIPT_DIR}/.definitions.sh # Import the check definitions source ${SCRIPT_DIR}/checks.sh -check_pod_ready "nvidia-driver-daemonset" -check_pod_ready "nvidia-container-toolkit-daemonset" -check_pod_ready "nvidia-device-plugin-daemonset" -check_pod_ready "nvidia-dcgm-exporter" -check_pod_ready "gpu-feature-discovery" -check_pod_ready "nvidia-operator-validator" +# wait for the nvidia-driver pod to be ready +# If successful, then wait for the validator pod to be ready (this means that the rest of the pods are healthy) +# collect log in case of failure +check_pod_ready "nvidia-driver-daemonset" ${DAEMON_POD_STATUS_TIME_OUT} && \ + check_pod_ready "nvidia-operator-validator" ${POD_STATUS_TIME_OUT}; exit_status=$? +if [ $exit_status -ne 0 ]; then + curl -o ${SCRIPT_DIR}/must-gather.sh "https://raw.githubusercontent.com/NVIDIA/gpu-operator/main/hack/must-gather.sh" + chmod +x ${SCRIPT_DIR}/must-gather.sh + ARTIFACT_DIR="${LOG_DIR}" ${SCRIPT_DIR}/must-gather.sh + ${SCRIPT_DIR}/uninstall-operator.sh ${TEST_NAMESPACE} "gpu-operator" + exit 1 +else + echo "All gpu-operator pods are ready." +fi