# Execute ZKVM-Perf (Matrix) #66
# NOTE: GitHub's file viewer flagged this file as containing bidirectional
# Unicode text that may be interpreted or compiled differently than it
# appears. To review, open the file in an editor that reveals hidden Unicode
# characters.
# Manually dispatched workflow. The inputs below are echoed and then forwarded
# to run-on-runner.yml by the job defined in this file.
name: Execute ZKVM-Perf (Matrix)

on:
  workflow_dispatch:
    inputs:
      # Comma-separated list of prover names.
      provers:
        description: 'Provers to use (comma-separated)'
        required: false
        type: string
        default: 'sp1'
      # Comma-separated list of benchmark program names.
      programs:
        description: 'Programs to benchmark (comma-separated)'
        required: false
        type: string
        default: 'loop,fibonacci,tendermint,reth1,reth2'
      # Base filename; the job appends "_<instance_type>" before forwarding it.
      filename:
        description: 'Filename for the benchmark'
        required: false
        type: string
        default: 'benchmark'
      # Declared as a string (not a number) and forwarded unchanged.
      trials:
        description: 'Number of trials to run'
        required: false
        type: string
        default: '1'
      sp1_ref:
        description: 'SP1 reference (commit hash or branch name)'
        required: false
        type: string
        default: 'dev'
      # JSON object serialized as a string; forwarded verbatim.
      additional_params:
        description: 'Additional parameters as JSON'
        required: false
        type: string
        default: '{"hashfns":"poseidon","shard_sizes":"22"}'
jobs:
  # One job per matrix entry. Each job starts an EC2 self-hosted runner,
  # dispatches run-on-runner.yml with that runner's label, waits for the
  # dispatched run to complete, and finally stops the EC2 runner.
  run-benchmarks:
    strategy:
      matrix:
        include:
          - instance_type: g6.16xlarge
            enable_gpu: true
            ami_id: ami-079a6a210557ef0e4
          - instance_type: r7i.16xlarge
            enable_gpu: false
            ami_id: ami-079a6a210557ef0e4
      fail-fast: false # This prevents the entire matrix from failing if one job fails
    name: Run on ${{ matrix.instance_type }}
    runs-on: ubuntu-latest
    continue-on-error: true # This allows the workflow to continue even if this job fails
    steps:
      # Inputs are handed to the shell through environment variables instead of
      # being spliced into the script text, so quotes or shell metacharacters in
      # an input cannot alter the command (the JSON default of
      # additional_params contains double quotes).
      - name: Echo Workflow Inputs
        env:
          BENCH_PROVERS: ${{ inputs.provers }}
          BENCH_PROGRAMS: ${{ inputs.programs }}
          BENCH_FILENAME: ${{ inputs.filename }}
          BENCH_TRIALS: ${{ inputs.trials }}
          BENCH_SP1_REF: ${{ inputs.sp1_ref }}
          BENCH_ADDITIONAL_PARAMS: ${{ inputs.additional_params }}
        run: |
          echo "Provers: $BENCH_PROVERS"
          echo "Programs: $BENCH_PROGRAMS"
          echo "Filename: $BENCH_FILENAME"
          echo "Trials: $BENCH_TRIALS"
          echo "SP1 Reference: $BENCH_SP1_REF"
          echo "Additional Parameters: $BENCH_ADDITIONAL_PARAMS"

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}

      # NOTE(review): this third-party action is pinned to a moving branch and
      # receives GH_PAT; consider pinning to a reviewed commit SHA.
      - name: Start EC2 runner
        id: start-ec2-runner
        uses: xJonathanLEI/ec2-github-runner@main
        with:
          mode: start
          github-token: ${{ secrets.GH_PAT }}
          ec2-image-id: ${{ matrix.ami_id }}
          ec2-instance-type: ${{ matrix.instance_type }}
          subnet-id: ${{ secrets.AWS_SUBNET_ID }}
          security-group-id: ${{ secrets.AWS_SG_ID }}
          storage-size: 1024
          aws-resource-tags: >
            [
              {"Key": "Name", "Value": "ec2-github-runner"},
              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
            ]

      # Dispatches run-on-runner.yml and then polls for the run it created.
      # The run is identified by a job named "Run Benchmark on <runner label>".
      # On success the run id is published both as a step output and as the
      # TRIGGERED_RUN_ID environment variable for the next step.
      - name: Run benchmarks
        id: run-benchmarks
        uses: actions/github-script@v6
        env:
          # Values are read via process.env in the script rather than being
          # interpolated into JavaScript string literals, so a quote in an
          # input cannot break or inject code. EC2_RUNNER_LABEL is used (not
          # RUNNER_NAME) to avoid clashing with the runner's built-in variable.
          EC2_RUNNER_LABEL: ${{ steps.start-ec2-runner.outputs.label }}
          BENCH_INSTANCE_TYPE: ${{ matrix.instance_type }}
          BENCH_ENABLE_GPU: ${{ matrix.enable_gpu }}
          BENCH_PROVERS: ${{ inputs.provers }}
          BENCH_PROGRAMS: ${{ inputs.programs }}
          BENCH_FILENAME: ${{ inputs.filename }}_${{ matrix.instance_type }}
          BENCH_TRIALS: ${{ inputs.trials }}
          BENCH_SP1_REF: ${{ inputs.sp1_ref }}
          BENCH_ADDITIONAL_PARAMS: ${{ inputs.additional_params }}
        with:
          github-token: ${{ secrets.GH_PAT }}
          script: |
            const runnerName = process.env.EC2_RUNNER_LABEL;
            const maxAttempts = 30;
            const pollInterval = 10000; // 10 seconds
            const maxRunAgeMs = 300000; // only consider runs created within the last 5 minutes
            let triggeredRunId = null;

            console.log('Triggering benchmark workflow');
            try {
              await github.rest.actions.createWorkflowDispatch({
                owner: context.repo.owner,
                repo: context.repo.repo,
                workflow_id: 'run-on-runner.yml',
                ref: context.ref,
                inputs: {
                  runner_name: runnerName,
                  instance_type: process.env.BENCH_INSTANCE_TYPE,
                  enable_gpu: process.env.BENCH_ENABLE_GPU,
                  provers: process.env.BENCH_PROVERS,
                  programs: process.env.BENCH_PROGRAMS,
                  filename: process.env.BENCH_FILENAME,
                  trials: process.env.BENCH_TRIALS,
                  sp1_ref: process.env.BENCH_SP1_REF,
                  additional_params: process.env.BENCH_ADDITIONAL_PARAMS
                }
              });
              console.log('Benchmark workflow triggered successfully');
            } catch (error) {
              // Fail the step so the job does not report success when no
              // benchmark was started; the stop step still runs via always().
              core.setFailed(`Failed to trigger workflow: ${error.message}`);
              return;
            }

            // The dispatch call returns no run id, so poll the in-progress
            // runs of the target workflow and match on the job name.
            console.log('Polling for the triggered run');
            for (let attempt = 1; attempt <= maxAttempts; attempt++) {
              await new Promise(resolve => setTimeout(resolve, pollInterval));
              try {
                const runs = await github.rest.actions.listWorkflowRuns({
                  owner: context.repo.owner,
                  repo: context.repo.repo,
                  workflow_id: 'run-on-runner.yml',
                  status: 'in_progress'
                });
                console.log(`Found ${runs.data.workflow_runs.length} in-progress runs`);

                for (const run of runs.data.workflow_runs) {
                  if (new Date(run.created_at).getTime() > Date.now() - maxRunAgeMs) { // Within last 5 minutes
                    console.log(`Checking run ${run.id} created at ${run.created_at}`);
                    try {
                      const jobs = await github.rest.actions.listJobsForWorkflowRun({
                        owner: context.repo.owner,
                        repo: context.repo.repo,
                        run_id: run.id
                      });
                      console.log(`Run ${run.id} has ${jobs.data.jobs.length} jobs`);
                      for (const job of jobs.data.jobs) {
                        console.log(`  Job: ${job.name}`);
                      }
                      const matchingJob = jobs.data.jobs.find(job =>
                        job.name === `Run Benchmark on ${runnerName}`
                      );
                      if (matchingJob) {
                        triggeredRunId = run.id;
                        console.log(`Found matching run. Triggered run ID: ${triggeredRunId}`);
                        break;
                      } else {
                        console.log(`No matching job found for run ${run.id}`);
                      }
                    } catch (error) {
                      // Best effort: a failure to list one run's jobs must
                      // not stop the scan of the remaining runs.
                      console.log(`Error checking jobs for run ${run.id}: ${error.message}`);
                      continue;
                    }
                  } else {
                    console.log(`Skipping run ${run.id} as it's older than 5 minutes`);
                  }
                }

                if (triggeredRunId) break;
                console.log(`Attempt ${attempt}: Matching run not found yet. Continuing to poll...`);
              } catch (error) {
                // Best effort: retry on the next attempt.
                console.log(`Error while polling: ${error.message}`);
              }
            }

            if (!triggeredRunId) {
              core.setFailed('Failed to find the triggered workflow run with matching job after maximum attempts');
              return;
            }

            core.setOutput('triggered_run_id', triggeredRunId);
            core.exportVariable('TRIGGERED_RUN_ID', triggeredRunId);
            console.log(`Triggered run ID: ${triggeredRunId}`);

      # Polls the dispatched run every 3 minutes until it completes or the
      # wait budget is exhausted.
      - name: Wait for benchmark completion
        uses: actions/github-script@v6
        with:
          github-token: ${{ secrets.GH_PAT }}
          script: |
            const triggeredRunId = process.env.TRIGGERED_RUN_ID;
            if (!triggeredRunId) {
              core.setFailed('No triggered run ID found');
              return;
            }

            // 10 hours in milliseconds
            // NOTE(review): GitHub-hosted runners cap job execution time
            // (documented as 6 hours), which is shorter than this budget;
            // confirm whether this limit can ever be reached on ubuntu-latest.
            const maxWaitTime = 36000000;
            // 3 minutes in milliseconds
            const checkInterval = 180000;
            const startTime = Date.now();
            const runUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${triggeredRunId}`;

            console.log(`Waiting for benchmark job to complete. Job URL: ${runUrl}`);

            while (true) {
              // A transient API error must not abort a multi-hour wait (that
              // would stop the EC2 runner mid-benchmark), so log it and keep
              // polling until the deadline.
              let status = 'unknown';
              try {
                const run = await github.rest.actions.getWorkflowRun({
                  owner: context.repo.owner,
                  repo: context.repo.repo,
                  run_id: triggeredRunId
                });
                status = run.data.status;

                if (status === 'completed') {
                  console.log(`Benchmark workflow completed with conclusion: ${run.data.conclusion}`);
                  if (run.data.conclusion !== 'success') {
                    // Kept as a warning (not a failure), as in the original,
                    // but surfaced as a workflow annotation.
                    core.warning(`Benchmark workflow failed with conclusion: ${run.data.conclusion}. Job URL: ${runUrl}`);
                  }
                  break;
                }
              } catch (error) {
                console.log(`Error while checking run status: ${error.message}`);
              }

              if (Date.now() - startTime > maxWaitTime) {
                core.setFailed(`Benchmark workflow did not complete within the maximum wait time. Job URL: ${runUrl}`);
                break;
              }

              console.log(`Waiting for benchmark to complete... Current status: ${status}. Job URL: ${runUrl}`);
              await new Promise(resolve => setTimeout(resolve, checkInterval));
            }

      # Runs even when earlier steps failed, but only if the start step
      # actually reported an instance id; otherwise there is nothing to stop
      # and the action would be invoked with empty identifiers.
      - name: Stop EC2 runner
        if: ${{ always() && steps.start-ec2-runner.outputs.ec2-instance-id != '' }}
        uses: xJonathanLEI/ec2-github-runner@main
        with:
          mode: stop
          github-token: ${{ secrets.GH_PAT }}
          label: ${{ steps.start-ec2-runner.outputs.label }}
          ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
    # NOTE(review): this job runs as a matrix, so both entries write the same
    # output name; presumably only one value is visible to dependents - confirm.
    outputs:
      triggered_run_id: ${{ steps.run-benchmarks.outputs.triggered_run_id }}