diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index cc2ce1f8..20b30335 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-name: CI
+name: End-to-end tests
 
 on:
   workflow_run:
@@ -21,15 +21,10 @@ on:
       - completed
     branches:
       - main
 
 jobs:
   e2e-tests-nvidiadriver:
     runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        driver:
-          - 535.183.06
-          - 550.90.07
 
     steps:
       - name: Check out code
@@ -41,7 +36,6 @@ jobs:
           AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
           AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
           AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
-          AWS_SESSION_TOKEN: ${{ secrets.AWS_SESSION_TOKEN }}
         with:
           aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
           aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
@@ -59,6 +53,8 @@ jobs:
           echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV
           echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV
           echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
+          DRIVER_VERSIONS=$(grep '^DRIVER_VERSIONS ?=' versions.mk | awk -F' ?= ' '{print $2}')
+          echo "DRIVER_VERSIONS=$DRIVER_VERSIONS" >> $GITHUB_ENV
 
       - name: Validate gpu driver
         env:
@@ -66,4 +62,32 @@ jobs:
         run: |
           sudo chmod 644 ${{ github.workspace }}/.cache/key
           echo "${{ secrets.AWS_SSH_KEY }}" > ${private_key} && chmod 400 ${private_key}
-          ./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${{ matrix.driver }}
+          # Run the e2e suite once per driver version read from versions.mk.
+          # Keep going after a failure so every version is exercised; rc holds
+          # the last non-zero status and becomes the exit code of this step.
+          rc=0
+          for driver_version in ${DRIVER_VERSIONS}; do
+            echo "Running e2e for DRIVER_VERSION=$driver_version"
+            # Reset per iteration: without this, status is unset after a
+            # successful first run (breaking the test below) and a stale
+            # failure is re-reported for every later version.
+            status=0
+            ./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${driver_version} || status=$?
+            if [ "$status" -ne 0 ]; then
+              echo "e2e validation failed for driver version $driver_version with status $status"
+              rc=$status
+            fi
+          done
+          source ./tests/scripts/.definitions.sh
+          # Best effort: LOG_DIR is only written by must-gather.sh after a failed
+          # verification, so a failed pull must not fail an otherwise green run.
+          ./tests/scripts/pull.sh ${LOG_DIR} logs || echo "WARNING: could not pull ${LOG_DIR} from the test instance"
+          exit $rc
+
+      - name: Archive test logs
+        if: ${{ failure() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: nvidiadriver-e2e-test-logs
+          path: ./logs/
+          retention-days: 15
diff --git a/.github/workflows/image.yaml b/.github/workflows/image.yaml
index c167a324..56edf618 100644
--- a/.github/workflows/image.yaml
+++ b/.github/workflows/image.yaml
@@ -34,6 +34,7 @@ jobs:
     strategy:
       matrix:
         driver:
+          - 470.256.02
           - 535.183.06
           - 550.90.07
         dist:
@@ -55,6 +56,10 @@ jobs:
           - ispr: true
             dist: ubuntu20.04
             driver: 550.90.07
+          - driver: 470.256.02
+            dist: ubuntu22.04
+          - driver: 470.256.02
+            dist: rhel9
       fail-fast: false
     steps:
       - uses: actions/checkout@v4
diff --git a/tests/scripts/.definitions.sh b/tests/scripts/.definitions.sh
index 4892ea17..f254bc00 100644
--- a/tests/scripts/.definitions.sh
+++ b/tests/scripts/.definitions.sh
@@ -19,3 +19,10 @@ CASES_DIR="$( cd "${TEST_DIR}/cases" && pwd )"
 : ${HELM_NVIDIA_REPO:="https://helm.ngc.nvidia.com/nvidia"}
 
 : ${TARGET_DRIVER_VERSION:="550.90.07"}
+
+# Timeouts handed to `kubectl wait --timeout` by check_pod_ready
+: ${DAEMON_POD_STATUS_TIME_OUT:="15m"}
+: ${POD_STATUS_TIME_OUT:="2m"}
+
+# Where must-gather.sh output is written on failure and pulled from by CI
+: ${LOG_DIR:="/tmp/logs"}
diff --git a/tests/scripts/checks.sh b/tests/scripts/checks.sh
index 72658653..c30b2d4f 100755
--- a/tests/scripts/checks.sh
+++ b/tests/scripts/checks.sh
@@ -2,35 +2,24 @@
 
+# check_pod_ready <app-label> <timeout>
+# Blocks in `kubectl wait` until the pods labelled app=<app-label> in
+# ${TEST_NAMESPACE} report condition Ready. Returns 0 on success; otherwise
+# lists the pods in the namespace and returns 1. <timeout> goes to --timeout.
 check_pod_ready() {
     local pod_label=$1
-    local current_time=0
-    while :; do
-        echo "Checking $pod_label pod"
-        kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE}
+    local pod_status_time_out=$2
+
+    echo "Checking $pod_label pod"
+
+    kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE}
 
-        echo "Checking $pod_label pod readiness"
-        is_pod_ready=$(kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE} -ojsonpath='{range .items[*]}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' 2>/dev/null || echo "terminated")
+    echo "Checking $pod_label pod readiness"
 
-        if [ "${is_pod_ready}" = "True" ]; then
-            # Check if the pod is not in terminating state
-            is_pod_terminating=$(kubectl get pods -lapp=$pod_label -n ${TEST_NAMESPACE} -o jsonpath='{.items[0].metadata.deletionGracePeriodSeconds}' 2>/dev/null || echo "terminated")
-            if [ "${is_pod_terminating}" != "" ]; then
-                echo "pod $pod_label is in terminating state..."
-            else
-                echo "Pod $pod_label is ready"
-                break;
-            fi
-        fi
-
-        if [[ "${current_time}" -gt $((60 * 45)) ]]; then
-            echo "timeout reached"
-            exit 1;
-        fi
-
-        # Echo useful information on stdout
+    if kubectl wait -n ${TEST_NAMESPACE} --for=condition=Ready pod -l "app=$pod_label" --timeout "${pod_status_time_out}"; then
+        return 0
+    else
+        # print status of pod
         kubectl get pods -n ${TEST_NAMESPACE}
+    fi
 
-        echo "Sleeping 5 seconds"
-        current_time=$((${current_time} + 5))
-        sleep 5
-    done
+    return 1
 }
diff --git a/tests/scripts/end-to-end-nvidia-driver.sh b/tests/scripts/end-to-end-nvidia-driver.sh
index e87bdba9..d272efab 100755
--- a/tests/scripts/end-to-end-nvidia-driver.sh
+++ b/tests/scripts/end-to-end-nvidia-driver.sh
@@ -7,8 +7,12 @@ echo ""
 echo ""
 echo "--------------Installing the GPU Operator--------------"
 
-# Install the operator with usePrecompiled mode set to true
 ${SCRIPT_DIR}/install-operator.sh
 
 "${SCRIPT_DIR}"/verify-operator.sh
+
+echo "--------------Verification completed for GPU Operator, uninstalling the operator--------------"
+
+${SCRIPT_DIR}/uninstall-operator.sh ${TEST_NAMESPACE} "gpu-operator"
+
 echo "--------------Verification completed for GPU Operator--------------"
diff --git a/tests/scripts/must-gather.sh b/tests/scripts/must-gather.sh
new file mode 100755
index 00000000..c66b172d
--- /dev/null
+++ b/tests/scripts/must-gather.sh
@@ -0,0 +1,265 @@
+#!/usr/bin/env bash
+#
+# Gathers GPU Operator diagnostics into ARTIFACT_DIR: ClusterPolicy, GPU node
+# labels and status, operator/operand pod logs and descriptions, DaemonSets
+# and nvidia-bug-report archives from the driver pods. Uses kubectl, else oc.
+#
+
+set -o nounset
+set -x
+
+K=kubectl
+if ! $K version > /dev/null; then
+    K=oc
+
+    if ! $K version > /dev/null; then
+        echo "FATAL: neither 'kubectl' nor 'oc' appear to be working properly. Exiting ..."
+        exit 1
+    fi
+fi
+
+if [[ "$0" == "/usr/bin/gather" ]]; then
+    echo "Running as must-gather plugin image"
+    export ARTIFACT_DIR=/must-gather
+else
+    if [ -z "${ARTIFACT_DIR:-}" ]; then
+        export ARTIFACT_DIR="/tmp/nvidia-gpu-operator_$(date +%Y%m%d_%H%M)"
+    fi
+    echo "Using ARTIFACT_DIR=$ARTIFACT_DIR"
+fi
+
+mkdir -p "$ARTIFACT_DIR"
+
+echo
+
+exec 1> >(tee $ARTIFACT_DIR/must-gather.log)
+exec 2> $ARTIFACT_DIR/must-gather.stderr.log
+
+if [[ "$0" == "/usr/bin/gather" ]]; then
+    echo "NVIDIA GPU Operator" > $ARTIFACT_DIR/version
+    echo "${VERSION:-N/A}" >> $ARTIFACT_DIR/version
+fi
+
+ocp_cluster=$($K get clusterversion/version --ignore-not-found -oname || true)
+
+if [[ "$ocp_cluster" ]]; then
+    echo "Running in OpenShift."
+    echo "Get the cluster version"
+    $K get clusterversion/version -oyaml > $ARTIFACT_DIR/openshift_version.yaml
+fi
+
+echo "Get the operator namespaces"
+OPERATOR_POD_NAME=$($K get pods -lapp=gpu-operator -oname -A)
+
+if [ -z "$OPERATOR_POD_NAME" ]; then
+    echo "FATAL: could not find the GPU Operator Pod ..."
+    exit 1
+fi
+
+OPERATOR_NAMESPACE=$($K get pods -lapp=gpu-operator -A -ojsonpath={.items[].metadata.namespace} --ignore-not-found)
+
+echo "Using '$OPERATOR_NAMESPACE' as operator namespace"
+echo ""
+
+echo "#"
+echo "# ClusterPolicy"
+echo "#"
+echo
+
+CLUSTER_POLICY_NAME=$($K get clusterpolicy -oname)
+
+if [[ "$CLUSTER_POLICY_NAME" ]]; then
+    echo "Get $CLUSTER_POLICY_NAME"
+    $K get -oyaml $CLUSTER_POLICY_NAME > $ARTIFACT_DIR/cluster_policy.yaml
+else
+    echo "Mark the ClusterPolicy as missing"
+    touch $ARTIFACT_DIR/cluster_policy.missing
+fi
+
+echo
+echo "#"
+echo "# Nodes and machines"
+echo "#"
+echo
+
+if [ "$ocp_cluster" ]; then
+    echo "Get all the machines"
+    $K get machines -A > $ARTIFACT_DIR/all_machines.list
+fi
+
+echo "Get the labels of the nodes with NVIDIA PCI cards"
+
+GPU_PCI_LABELS=(feature.node.kubernetes.io/pci-10de.present feature.node.kubernetes.io/pci-0302_10de.present feature.node.kubernetes.io/pci-0300_10de.present)
+
+gpu_pci_nodes=""
+for label in ${GPU_PCI_LABELS[@]}; do
+    gpu_pci_nodes="$gpu_pci_nodes $($K get nodes -l$label -oname)"
+done
+
+# NOTE(review): gpu_pci_nodes always holds at least the spaces added by the
+# loop above, so this -z test is never true and gathering always continues.
+if [ -z "$gpu_pci_nodes" ]; then
+    echo "FATAL: could not find nodes with NVIDIA PCI labels"
+    exit 0
+fi
+
+for node in $(echo "$gpu_pci_nodes"); do
+    echo "$node" | cut -d/ -f2 >> $ARTIFACT_DIR/gpu_nodes.labels
+    $K get $node '-ojsonpath={.metadata.labels}' \
+        | sed 's|,|,- |g' \
+        | tr ',' '\n' \
+        | sed 's/{"/- /' \
+        | tr : = \
+        | sed 's/"//g' \
+        | sed 's/}/\n/' \
+        >> $ARTIFACT_DIR/gpu_nodes.labels
+    echo "" >> $ARTIFACT_DIR/gpu_nodes.labels
+done
+
+echo "Get the GPU nodes (status)"
+$K get nodes -l nvidia.com/gpu.present=true -o wide > $ARTIFACT_DIR/gpu_nodes.status
+
+echo "Get the GPU nodes (description)"
+$K describe nodes -l nvidia.com/gpu.present=true > $ARTIFACT_DIR/gpu_nodes.descr
+
+echo ""
+echo "#"
+echo "# Operator Pod"
+echo "#"
+echo
+
+echo "Get the GPU Operator Pod (status)"
+$K get $OPERATOR_POD_NAME \
+    -owide \
+    -n $OPERATOR_NAMESPACE \
+    > $ARTIFACT_DIR/gpu_operator_pod.status
+
+echo "Get the GPU Operator Pod (yaml)"
+$K get $OPERATOR_POD_NAME \
+    -oyaml \
+    -n $OPERATOR_NAMESPACE \
+    > $ARTIFACT_DIR/gpu_operator_pod.yaml
+
+echo "Get the GPU Operator Pod logs"
+$K logs $OPERATOR_POD_NAME \
+    -n $OPERATOR_NAMESPACE \
+    > "$ARTIFACT_DIR/gpu_operator_pod.log"
+
+$K logs $OPERATOR_POD_NAME \
+    -n $OPERATOR_NAMESPACE \
+    --previous \
+    > "$ARTIFACT_DIR/gpu_operator_pod.previous.log"
+
+echo ""
+echo "#"
+echo "# Operand Pods"
+echo "#"
+echo ""
+
+echo "Get the Pods in $OPERATOR_NAMESPACE (status)"
+$K get pods -owide \
+    -n $OPERATOR_NAMESPACE \
+    > $ARTIFACT_DIR/gpu_operand_pods.status
+
+echo "Get the Pods in $OPERATOR_NAMESPACE (yaml)"
+$K get pods -oyaml \
+    -n $OPERATOR_NAMESPACE \
+    > $ARTIFACT_DIR/gpu_operand_pods.yaml
+
+echo "Get the GPU Operator Pods Images"
+$K get pods -n $OPERATOR_NAMESPACE \
+    -o=jsonpath='{range .items[*]}{"\n"}{.metadata.name}{":\t"}{range .spec.containers[*]}{.image}{" "}{end}{end}' \
+    > $ARTIFACT_DIR/gpu_operand_pod_images.txt
+
+echo "Get the description and logs of the GPU Operator Pods"
+
+for pod in $($K get pods -n $OPERATOR_NAMESPACE -oname);
+do
+    if ! $K get $pod -n $OPERATOR_NAMESPACE -ojsonpath={.metadata.labels} | grep -E --quiet '(nvidia|gpu)'; then
+        echo "Skipping $pod, not a NVIDA/GPU Pod ..."
+        continue
+    fi
+    pod_name=$(echo "$pod" | cut -d/ -f2)
+
+    if [ "$pod" == "$OPERATOR_POD_NAME" ]; then
+        echo "Skipping operator pod $pod_name ..."
+        continue
+    fi
+
+    $K logs $pod \
+        -n $OPERATOR_NAMESPACE \
+        --all-containers --prefix \
+        > $ARTIFACT_DIR/gpu_operand_pod_$pod_name.log
+
+    $K logs $pod \
+        -n $OPERATOR_NAMESPACE \
+        --all-containers --prefix \
+        --previous \
+        > $ARTIFACT_DIR/gpu_operand_pod_$pod_name.previous.log
+
+    $K describe $pod \
+        -n $OPERATOR_NAMESPACE \
+        > $ARTIFACT_DIR/gpu_operand_pod_$pod_name.descr
+done
+
+echo ""
+echo "#"
+echo "# Operand DaemonSets"
+echo "#"
+echo ""
+
+echo "Get the DaemonSets in $OPERATOR_NAMESPACE (status)"
+
+$K get ds \
+    -n $OPERATOR_NAMESPACE \
+    > $ARTIFACT_DIR/gpu_operand_ds.status
+
+
+echo "Get the DaemonSets in $OPERATOR_NAMESPACE (yaml)"
+
+$K get ds -oyaml \
+    -n $OPERATOR_NAMESPACE \
+    > $ARTIFACT_DIR/gpu_operand_ds.yaml
+
+echo "Get the description of the GPU Operator DaemonSets"
+
+for ds in $($K get ds -n $OPERATOR_NAMESPACE -oname);
+do
+    if ! $K get $ds -n $OPERATOR_NAMESPACE -ojsonpath={.metadata.labels} | grep -E --quiet '(nvidia|gpu)'; then
+        echo "Skipping $ds, not a NVIDA/GPU DaemonSet ..."
+        continue
+    fi
+    $K describe $ds \
+        -n $OPERATOR_NAMESPACE \
+        > $ARTIFACT_DIR/gpu_operand_ds_$(echo "$ds" | cut -d/ -f2).descr
+done
+
+echo ""
+echo "#"
+echo "# nvidia-bug-report.sh"
+echo "#"
+echo ""
+
+for pod in $($K get pods -lopenshift.driver-toolkit -oname -n $OPERATOR_NAMESPACE; $K get pods -lapp=nvidia-driver-daemonset -oname -n $OPERATOR_NAMESPACE; $K get pods -lapp=nvidia-vgpu-manager-daemonset -oname -n $OPERATOR_NAMESPACE);
+do
+    pod_nodename=$($K get $pod -ojsonpath={.spec.nodeName} -n $OPERATOR_NAMESPACE)
+    echo "Saving nvidia-bug-report from ${pod_nodename} ..."
+
+    # Use { ...; } rather than ( ... ): `continue` inside a subshell cannot
+    # advance this loop, so a failed step would fall through to the next one.
+    $K exec -n $OPERATOR_NAMESPACE $pod -- bash -c 'cd /tmp && nvidia-bug-report.sh' >&2 || \
+        { echo "Failed to collect nvidia-bug-report from ${pod_nodename}"; continue; }
+
+    $K cp $OPERATOR_NAMESPACE/$(basename $pod):/tmp/nvidia-bug-report.log.gz /tmp/nvidia-bug-report.log.gz || \
+        { echo "Failed to save nvidia-bug-report from ${pod_nodename}"; continue; }
+
+    mv /tmp/nvidia-bug-report.log.gz $ARTIFACT_DIR/nvidia-bug-report_${pod_nodename}.log.gz
+done
+
+echo ""
+echo "#"
+echo "# All done!"
+if [[ "$0" != "/usr/bin/gather" ]]; then
+    echo "# Logs saved into ${ARTIFACT_DIR}."
+fi
+echo "#"
diff --git a/tests/scripts/pull.sh b/tests/scripts/pull.sh
new file mode 100755
index 00000000..ab5a0f6a
--- /dev/null
+++ b/tests/scripts/pull.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+#
+# Usage: pull.sh <remote-path> <local-path>
+# Hands ${instance_hostname}:<remote-path> and <local-path> to sync.sh.
+
+if [[ $# -ne 2 ]]; then
+    echo "Pull requires a source and destination"
+    exit 1
+fi
+
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+source ${SCRIPT_DIR}/.definitions.sh
+source ${SCRIPT_DIR}/.local.sh
+
+${SCRIPT_DIR}/sync.sh "${instance_hostname}:${1}" "${2}"
diff --git a/tests/scripts/remote.sh b/tests/scripts/remote.sh
index 40c45e8f..dcd8cf9c 100755
--- a/tests/scripts/remote.sh
+++ b/tests/scripts/remote.sh
@@ -4,4 +4,5 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 source ${SCRIPT_DIR}/.definitions.sh
 source ${SCRIPT_DIR}/.local.sh
 
-ssh -i ${private_key} ${instance_hostname} "${@}"
+# Probe the server every 60s; give up after 30 unanswered probes
+ssh -o ServerAliveInterval=60 -o ServerAliveCountMax=30 -i ${private_key} ${instance_hostname} "${@}"
diff --git a/tests/scripts/uninstall-operator.sh b/tests/scripts/uninstall-operator.sh
new file mode 100755
index 00000000..3a5fd7a0
--- /dev/null
+++ b/tests/scripts/uninstall-operator.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+#
+# Usage: uninstall-operator.sh <namespace> <helm-release-name>
+# Removes the Helm release and whatever is left in <namespace>. Every step is
+# best effort (|| true) so cleanup continues past individual failures.
+
+if [[ "${SKIP_INSTALL}" == "true" ]]; then
+    echo "Skipping uninstall: SKIP_INSTALL=${SKIP_INSTALL}"
+    exit 0
+fi
+
+if [[ $# -ne 2 ]]; then
+    echo "Uninstall requires a namespace and a release name"
+    exit 1
+fi
+
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+source ${SCRIPT_DIR}/.definitions.sh
+
+namespace=$1
+release_name=$2
+# Uninstall the release before deleting its namespace: Helm keeps the release
+# record in that namespace, so the reverse order leaves nothing to uninstall.
+helm uninstall "$release_name" --namespace "$namespace" || true
+kubectl delete daemonsets --all -n "$namespace" || true
+kubectl delete deployments --all -n "$namespace" || true
+kubectl delete services --all -n "$namespace" || true
+kubectl delete pods --all -n "$namespace" || true
+kubectl delete namespace "$namespace" || true
diff --git a/tests/scripts/verify-operator.sh b/tests/scripts/verify-operator.sh
index 8836d560..e65c5a3e 100755
--- a/tests/scripts/verify-operator.sh
+++ b/tests/scripts/verify-operator.sh
@@ -11,9 +11,16 @@ source ${SCRIPT_DIR}/.definitions.sh
 # Import the check definitions
 source ${SCRIPT_DIR}/checks.sh
 
-check_pod_ready "nvidia-driver-daemonset"
-check_pod_ready "nvidia-container-toolkit-daemonset"
-check_pod_ready "nvidia-device-plugin-daemonset"
-check_pod_ready "nvidia-dcgm-exporter"
-check_pod_ready "gpu-feature-discovery"
-check_pod_ready "nvidia-operator-validator"
+# Wait for the nvidia-driver pod to be ready; if it is, wait for the validator
+# pod (its readiness means the rest of the pods are healthy). On failure,
+# collect logs into LOG_DIR and uninstall the operator.
+# The checks sit in the `if` condition so a failure reaches the else branch
+# even if errexit is enabled earlier in this script (not visible here).
+if check_pod_ready "nvidia-driver-daemonset" "${DAEMON_POD_STATUS_TIME_OUT}" && \
+    check_pod_ready "nvidia-operator-validator" "${POD_STATUS_TIME_OUT}"; then
+    echo "All gpu-operator pods are ready."
+else
+    ARTIFACT_DIR="${LOG_DIR}" ${SCRIPT_DIR}/must-gather.sh
+    ${SCRIPT_DIR}/uninstall-operator.sh ${TEST_NAMESPACE} "gpu-operator"
+    exit 1
+fi