Skip to content

Commit

Permalink
Pre-compiled end-to-end gpu driver validation
Browse files Browse the repository at this point in the history
Signed-off-by: shiva kumar <[email protected]>
  • Loading branch information
shivakunv committed Aug 23, 2024
1 parent f7ca2f8 commit 94d9453
Show file tree
Hide file tree
Showing 2 changed files with 84 additions and 20 deletions.
10 changes: 8 additions & 2 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -60,17 +60,23 @@ jobs:
id: get_public_dns_name
uses: mikefarah/yq@master
with:
cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml
cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml

- name: Get instance id
id: get_instance_id
uses: mikefarah/yq@master
with:
cmd: yq '.status.properties[] | select(.name == "instance-id") | .value' /github/workspace/.cache/holodeck.yaml

- name: Set and Calculate test vars
run: |
# Export the connection details and test parameters to later workflow steps
# through the $GITHUB_ENV file.
echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> "$GITHUB_ENV"
# instance_id is the raw EC2 instance id: remote_retry.sh hands it to
# `aws ec2 ... --instance-ids`, so it must not carry the SSH "ubuntu@" user prefix.
echo "instance_id=${{ steps.get_instance_id.outputs.result }}" >> "$GITHUB_ENV"
echo "private_key=${{ github.workspace }}/key.pem" >> "$GITHUB_ENV"
# Materialize the SSH private key from the repository secret and make it read-only.
echo "${{ secrets.AWS_SSH_KEY }}" > ${{ github.workspace }}/key.pem && chmod 400 ${{ github.workspace }}/key.pem
echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> "$GITHUB_ENV"
# Take the value to the right of "DRIVER_VERSIONS ?=" in versions.mk.
DRIVER_VERSIONS=$(grep '^DRIVER_VERSIONS ?=' versions.mk | awk -F' ?= ' '{print $2}')
echo "DRIVER_VERSIONS=$DRIVER_VERSIONS" >> "$GITHUB_ENV"
cat ${{ github.workspace }}/.cache/holodeck.yaml
exit 0
- name: Validate gpu driver
Expand Down
94 changes: 76 additions & 18 deletions tests/scripts/remote_retry.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,83 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source ${SCRIPT_DIR}/.definitions.sh
source ${SCRIPT_DIR}/.local.sh

try_ssh_connection() {
# Previous SSH-based readiness check, kept commented out for reference (known to work):
# try_ssh_connection() {
# status=0
# ssh -o ConnectTimeout=10 -i ${private_key} ${instance_hostname} "exit" || status=$?
# return $status
# }

# echo "Waiting for aws system to come back online..."
# START_TIME=$(date +%s)
# while true; do
# sleep 60
# try_ssh_connection
# if [ $? -eq 0 ]; then
# echo "Successfully connected to aws system after reboot."
# break;
# fi
# ELAPSED_TIME=$(($(date +%s) - START_TIME))
# if [ "$ELAPSED_TIME" -ge "$SYSTEM_ONLINE_CHECK_TIMEOUT" ]; then
# echo "Failed to connect to aws within ${SYSTEM_ONLINE_CHECK_TIMEOUT} minutes after reboot."
# exit 1
# fi
# echo "ssh retry again..."
# done

#######################################
# Install the AWS CLI on the local machine.
#
# Tries the distribution package first; if that fails, falls back to the
# zip-based AWS CLI v2 installer.
#
# Arguments: none
# Outputs:   whatever apt-get, curl, unzip and the installer print
# Returns:   0 when either installation path succeeds, otherwise the exit
#            status of the fallback step that failed
#######################################
install_aws() {
  local status=0

  sudo apt-get update
  sudo apt-get install -y awscli || status=$?

  if [ "$status" -ne 0 ]; then
    # The package install failed. The result of the fallback decides the
    # return value, so a successful manual install is no longer reported as
    # a failure just because apt-get failed first.
    # -y / -o keep apt-get and unzip from blocking on an interactive prompt;
    # curl -f turns an HTTP error into a non-zero exit instead of saving the
    # error page as the zip file.
    if sudo apt-get install -y unzip &&
      curl -fL "https://d1uj6qtbmh3dt5.cloudfront.net/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" &&
      unzip -o awscliv2.zip &&
      sudo ./aws/install; then
      status=0
    else
      status=$?
    fi
  fi

  return "$status"
}

# Poll the instance over SSH until it accepts connections again.
# ELAPSED_TIME is derived from `date +%s`, so SYSTEM_ONLINE_CHECK_TIMEOUT is
# compared as a number of seconds (the failure message reports it as such).
echo "Waiting for aws system to come back online..."
START_TIME=$(date +%s)
while true; do
  # Test the call directly so a failed attempt is handled here rather than
  # through a separate $? check.
  if try_ssh_connection; then
    echo "Successfully connected to aws system after reboot."
    break
  fi
  ELAPSED_TIME=$(($(date +%s) - START_TIME))
  if [ "$ELAPSED_TIME" -ge "$SYSTEM_ONLINE_CHECK_TIMEOUT" ]; then
    echo "Failed to connect to aws within ${SYSTEM_ONLINE_CHECK_TIMEOUT} seconds after reboot."
    exit 1
  fi
  sleep 60
  echo "ssh retry again..."
done
# Print the number of seconds that have passed since $start_time
# (epoch seconds, set by the caller before polling starts).
elapsed_time() {
  local now
  now=$(date +%s)
  echo "$(( now - start_time ))"
}

#######################################
# Poll EC2 every 10 seconds until $instance_id reports the 'running' state.
#
# Globals:
#   instance_id                  (read)    instance to query
#   SYSTEM_ONLINE_CHECK_TIMEOUT  (read)    limit compared against elapsed_time
#   start_time                   (written) epoch seconds when polling began
# Outputs:   progress messages on stdout
# Returns:   0 once the instance is running; exits the script with status 1
#            when the timeout is reached first
#######################################
check_aws_instace_is_running() {
  start_time=$(date +%s)
  while :; do
    echo "Waiting for instance $instance_id to be in 'running' state..."
    if ! aws ec2 describe-instances \
      --instance-ids "$instance_id" \
      --query "Reservations[*].Instances[*].State.Name" \
      --output text | grep -q "running"; then
      # Not running yet: give up once the time budget is spent, else retry.
      if [ $(elapsed_time) -ge "$SYSTEM_ONLINE_CHECK_TIMEOUT" ]; then
        echo "Timeout reached while waiting for aws instance to be 'running'."
        exit 1
      fi
      sleep 10
      continue
    fi
    echo "Instance $instance_id is running."
    return 0
  done
}

#######################################
# Poll EC2 every 10 seconds until every status check reported for
# $instance_id is 'ok'.
#
# Globals:
#   instance_id                  (read)    instance to query
#   SYSTEM_ONLINE_CHECK_TIMEOUT  (read)    limit compared against elapsed_time
#   start_time                   (written) epoch seconds when polling began
# Outputs:   progress messages on stdout
# Returns:   0 once all reported checks are 'ok'; exits the script with
#            status 1 when the timeout is reached first
#######################################
check_aws_instace_is_ok() {
  local checks check all_ok
  start_time=$(date +%s)
  while true; do
    echo "Waiting for instance $instance_id status checks to be 'ok'..."
    checks=$(aws ec2 describe-instance-status --instance-ids "$instance_id" --query "InstanceStatuses[*].{SystemStatus: SystemStatus.Status, InstanceStatus: InstanceStatus.Status}" --output text)

    # The text output is treated as one whitespace-separated word per check.
    # A substring match for "ok" would already succeed when only one of the
    # two checks is ok (e.g. "ok initializing"), so require every word to be
    # exactly "ok" and at least one word to be present.
    all_ok=false
    # shellcheck disable=SC2086 — word splitting of $checks is intended
    for check in $checks; do
      if [ "$check" = "ok" ]; then
        all_ok=true
      else
        all_ok=false
        break
      fi
    done

    if [ "$all_ok" = true ]; then
      echo "Instance $instance_id status checks are 'ok'."
      break
    fi
    if [ "$(elapsed_time)" -ge "$SYSTEM_ONLINE_CHECK_TIMEOUT" ]; then
      echo "Timeout reached while waiting for aws instance status checks to be 'ok'."
      exit 1
    fi
    sleep 10
  done
}

# Make the AWS CLI available before querying EC2.
install_aws

# First wait for the instance state to become 'running'; only then wait for
# its status checks to report 'ok'.
if check_aws_instace_is_running; then
  check_aws_instace_is_ok
fi

echo "Instance aws $instance_id is running and ready."

0 comments on commit 94d9453

Please sign in to comment.