From 94d9453cbe7930af707c5b9cff726b191dc55813 Mon Sep 17 00:00:00 2001
From: shiva kumar
Date: Fri, 23 Aug 2024 22:04:47 +0530
Subject: [PATCH] Pre-compiled end-to-end gpu driver validation

Signed-off-by: shiva kumar
---
 .github/workflows/ci.yaml     |  10 +++-
 tests/scripts/remote_retry.sh | 113 ++++++++++++++++++++++++++++++++++------
 2 files changed, 103 insertions(+), 20 deletions(-)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 836a68a7..bb926855 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -60,17 +60,23 @@ jobs:
         id: get_public_dns_name
         uses: mikefarah/yq@master
         with:
-          cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml
+          cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml
+
+      - name: Get instance id
+        id: get_instance_id
+        uses: mikefarah/yq@master
+        with:
+          cmd: yq '.status.properties[] | select(.name == "instance-id") | .value' /github/workspace/.cache/holodeck.yaml
 
       - name: Set and Calculate test vars
         run: |
           echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV
+          echo "instance_id=${{ steps.get_instance_id.outputs.result }}" >> $GITHUB_ENV
           echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV
           echo "${{ secrets.AWS_SSH_KEY }}" > ${{ github.workspace }}/key.pem && chmod 400 ${{ github.workspace }}/key.pem
           echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
           DRIVER_VERSIONS=$(grep '^DRIVER_VERSIONS ?=' versions.mk | awk -F' ?= ' '{print $2}')
           echo "DRIVER_VERSIONS=$DRIVER_VERSIONS" >> $GITHUB_ENV
-          cat ${{ github.workspace }}/.cache/holodeck.yaml
           exit 0
 
       - name: Validate gpu driver
diff --git a/tests/scripts/remote_retry.sh b/tests/scripts/remote_retry.sh
index ff9073ad..e0aaf29b 100755
--- a/tests/scripts/remote_retry.sh
+++ b/tests/scripts/remote_retry.sh
@@ -4,25 +4,102 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 source ${SCRIPT_DIR}/.definitions.sh
 source ${SCRIPT_DIR}/.local.sh
 
-try_ssh_connection() {
+# Previous SSH-based readiness check (known to work), kept for reference:
+# try_ssh_connection() {
+#     status=0
+#     ssh -o ConnectTimeout=10 -i ${private_key} ${instance_hostname} "exit" || status=$?
+#     return $status
+# }
+
+# echo "Waiting for aws system to come back online..."
+# START_TIME=$(date +%s)
+# while true; do
+#     sleep 60
+#     try_ssh_connection
+#     if [ $? -eq 0 ]; then
+#         echo "Successfully connected to aws system after reboot."
+#         break;
+#     fi
+#     ELAPSED_TIME=$(($(date +%s) - START_TIME))
+#     if [ "$ELAPSED_TIME" -ge "$SYSTEM_ONLINE_CHECK_TIMEOUT" ]; then
+#         echo "Failed to connect to aws within ${SYSTEM_ONLINE_CHECK_TIMEOUT} minutes after reboot."
+#         exit 1
+#     fi
+#     echo "ssh retry again..."
+# done
+
+# Install the AWS CLI: try the apt package first, then fall back to the
+# standalone v2 bundle. Returns 0 on success, the failing command's status
+# otherwise.
+install_aws() {
     status=0
-    ssh -o ConnectTimeout=10 -i ${private_key} ${instance_hostname} "exit" || status=$?
+    sudo apt-get update
+    sudo apt-get install awscli -y || status=$?
+    if [ "$status" -ne 0 ]; then
+        # Reset so the result reflects the fallback only; otherwise a
+        # successful fallback would still report the earlier apt failure.
+        status=0
+        sudo apt-get install -y unzip \
+            && curl -fsSL "https://d1uj6qtbmh3dt5.cloudfront.net/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" \
+            && unzip -o awscliv2.zip \
+            && sudo ./aws/install --update \
+            || status=$?
+    fi
     return $status
 }
 
-echo "Waiting for aws system to come back online..."
-START_TIME=$(date +%s)
-while true; do
-    try_ssh_connection
-    if [ $? -eq 0 ]; then
-        echo "Successfully connected to aws system after reboot."
-        break;
-    fi
-    ELAPSED_TIME=$(($(date +%s) - START_TIME))
-    if [ "$ELAPSED_TIME" -ge "$SYSTEM_ONLINE_CHECK_TIMEOUT" ]; then
-        echo "Failed to connect to aws within ${SYSTEM_ONLINE_CHECK_TIMEOUT} minutes after reboot."
-        exit 1
-    fi
-    sleep 60
-    echo "ssh retry again..."
-done
+# Print the number of seconds elapsed since $start_time (epoch seconds).
+elapsed_time() {
+    echo $(( $(date +%s) - start_time ))
+}
+
+# Poll EC2 until $instance_id reports state 'running'; exit 1 once
+# SYSTEM_ONLINE_CHECK_TIMEOUT seconds have passed.
+check_aws_instace_is_running() {
+    start_time=$(date +%s)
+    while true; do
+        echo "Waiting for instance $instance_id to be in 'running' state..."
+        if aws ec2 describe-instances --instance-ids "$instance_id" --query "Reservations[*].Instances[*].State.Name" --output text | grep -q "running"; then
+            echo "Instance $instance_id is running."
+            break
+        fi
+        if [ "$(elapsed_time)" -ge "$SYSTEM_ONLINE_CHECK_TIMEOUT" ]; then
+            echo "Timeout reached while waiting for aws instance to be 'running'."
+            exit 1
+        fi
+        sleep 10
+    done
+}
+
+# Poll EC2 until both the system and instance status checks of $instance_id
+# are 'ok'; exit 1 once SYSTEM_ONLINE_CHECK_TIMEOUT seconds have passed.
+check_aws_instace_is_ok() {
+    local first_check second_check
+    start_time=$(date +%s)
+    while true; do
+        echo "Waiting for instance $instance_id status checks to be 'ok'..."
+        status=$(aws ec2 describe-instance-status --instance-ids "$instance_id" --query "InstanceStatuses[*].{SystemStatus: SystemStatus.Status, InstanceStatus: InstanceStatus.Status}" --output text)
+        # The text output is expected to hold the two statuses on one line;
+        # require BOTH to be 'ok' (a plain grep passes when either one is).
+        read -r first_check second_check <<< "$status"
+        if [ "$first_check" = "ok" ] && [ "$second_check" = "ok" ]; then
+            echo "Instance $instance_id status checks are 'ok'."
+            break
+        fi
+        if [ "$(elapsed_time)" -ge "$SYSTEM_ONLINE_CHECK_TIMEOUT" ]; then
+            echo "Timeout reached while waiting for aws instance status checks to be 'ok'."
+            exit 1
+        fi
+        sleep 10
+    done
+}
+
+install_aws
+if ! command -v aws >/dev/null 2>&1; then
+    echo "aws cli is not available after the installation attempt." >&2
+    exit 1
+fi
+
+#Wait until the instance is in 'running' state, Wait until both status checks are 'ok'
+check_aws_instace_is_running && check_aws_instace_is_ok
+
+echo "Instance aws $instance_id is running and ready."