From d841d891cf1501bd1312160edf8b28e1aed460ab Mon Sep 17 00:00:00 2001
From: clee2000 <44682903+clee2000@users.noreply.github.com>
Date: Mon, 13 Jan 2025 13:26:22 -0800
Subject: [PATCH] AWS lambda for automatic whl metadata upload (#6159)

Write an AWS lambda for extracting and uploading the metadata file for
pep658 for whls.

I don't think we have terraform for the pytorch AWS account, so I think
this is the next best bet for having this in source control.

Do this instead of the way in pytorch/pytorch since this will apply to all
packages in the index, so we can also do it for domains and other packages
without code duplication.
---
 ...ploy_lambda_whl_metadata_upload_pep658.yml | 52 ++++++++++++++++
 .lintrunner.toml                              |  2 +
 .../whl_metadata_upload_pep658/Makefile       | 16 +++++
 .../whl_metadata_upload_pep658/README.md      | 23 +++++++
 .../lambda_function.py                        | 55 +++++++++++++++++
 .../requirements.txt                          |  1 +
 .../test_event.json                           | 14 +++++
 .../test_lambda_function.py                   | 60 +++++++++++++++++++
 8 files changed, 223 insertions(+)
 create mode 100644 .github/workflows/deploy_lambda_whl_metadata_upload_pep658.yml
 create mode 100644 aws/lambda/whl_metadata_upload_pep658/Makefile
 create mode 100644 aws/lambda/whl_metadata_upload_pep658/README.md
 create mode 100644 aws/lambda/whl_metadata_upload_pep658/lambda_function.py
 create mode 100644 aws/lambda/whl_metadata_upload_pep658/requirements.txt
 create mode 100644 aws/lambda/whl_metadata_upload_pep658/test_event.json
 create mode 100644 aws/lambda/whl_metadata_upload_pep658/test_lambda_function.py

diff --git a/.github/workflows/deploy_lambda_whl_metadata_upload_pep658.yml b/.github/workflows/deploy_lambda_whl_metadata_upload_pep658.yml
new file mode 100644
index 0000000000..758dab44c5
--- /dev/null
+++ b/.github/workflows/deploy_lambda_whl_metadata_upload_pep658.yml
@@ -0,0 +1,52 @@
+name: Deploy whl_metadata_upload_pep658 to pytorch AWS account
+
+on:
+  pull_request:
+    paths:
+      - .github/workflows/deploy_lambda_whl_metadata_upload_pep658.yml
+      - aws/lambda/whl_metadata_upload_pep658/**
+  push:
+    branches:
+      - main
+    paths:
+      - .github/workflows/deploy_lambda_whl_metadata_upload_pep658.yml
+      - aws/lambda/whl_metadata_upload_pep658/**
+
+defaults:
+  run:
+    working-directory: aws/lambda/whl_metadata_upload_pep658/
+
+jobs:
+  # Runs the handler against test_event.json in dry-run mode (no upload).
+  test:
+    runs-on: ubuntu-22.04
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.13'
+          cache: pip
+      - run: pip install -r requirements.txt
+      - run: python test_lambda_function.py
+
+  deploy:
+    needs: test
+    runs-on: ubuntu-22.04
+    # Deploy only for pushes to main; pull requests stop after the test job.
+    if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
+    # id-token: write lets the job request the OIDC token used to assume the role below.
+    permissions:
+      id-token: write
+      contents: read
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.13'
+          cache: pip
+      - name: configure aws credentials
+        uses: aws-actions/configure-aws-credentials@v1.7.0
+        with:
+          role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_whl_metadata_upload_pep658
+          aws-region: us-east-1
+      - run: make deploy
diff --git a/.lintrunner.toml b/.lintrunner.toml
index 740581fd69..aec82862b8 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -38,6 +38,7 @@ include_patterns = [
     'torchci/**/*.py',
     'torchci/**/*.pyi',
     '.github/scripts/*.py',
+    'aws/lambda/whl_metadata_upload_pep658/**/*.py',
 ]
 command = [
     'python3',
@@ -259,6 +260,7 @@ code = 'UFMT'
 include_patterns = [
     'tools/torchci/**/*.py',
     '.github/scripts/*.py',
+    'aws/lambda/whl_metadata_upload_pep658/**/*.py',
 ]
 command = [
     'python3',
diff --git a/aws/lambda/whl_metadata_upload_pep658/Makefile b/aws/lambda/whl_metadata_upload_pep658/Makefile
new file mode 100644
index 0000000000..a8d0209d71
--- /dev/null
+++ b/aws/lambda/whl_metadata_upload_pep658/Makefile
@@ -0,0 +1,16 @@
+.PHONY: prepare deploy clean
+
+# Build whl_metadata_upload_pep658.zip from the pinned requirements plus lambda_function.py.
+prepare: clean
+	mkdir -p ./packages
+	pip3 install --target ./packages -r requirements.txt
+	cd packages && zip -r ../whl_metadata_upload_pep658.zip .
+	zip -g whl_metadata_upload_pep658.zip lambda_function.py
+
+# Upload the freshly built zip as the new code of the existing lambda.
+deploy: prepare
+	aws lambda update-function-code --function-name whl_metadata_upload_pep658 --zip-file fileb://whl_metadata_upload_pep658.zip
+
+# Remove the build artifacts.
+clean:
+	rm -rf whl_metadata_upload_pep658.zip packages
diff --git a/aws/lambda/whl_metadata_upload_pep658/README.md b/aws/lambda/whl_metadata_upload_pep658/README.md
new file mode 100644
index 0000000000..73efe7164c
--- /dev/null
+++ b/aws/lambda/whl_metadata_upload_pep658/README.md
@@ -0,0 +1,23 @@
+This lambda is used on the pytorch AWS account to extract the metadata files
+from whls and upload them for use with [PEP 658][pep658]. They are then added
+to the index by [s3_management/manage.py][managepy].
+
+This account does not use terraform, so this is the source of truth for the
+code, and the configuration should be:
+* timeout: at least 30s (a starting point; raise it if large whls time out)
+* ephemeral storage (`/tmp`): at least the size of the largest whl we want to upload metadata for
+* Triggers:
+  * s3: put object events from the pytorch bucket with suffix `.whl`
+
+### Deployment
+
+A new version of the lambda can be deployed using `make deploy`. This is also
+done automatically in CI by
+`.github/workflows/deploy_lambda_whl_metadata_upload_pep658.yml`.
+
+### Testing + Backfill
+
+Please see `test_lambda_function.py`.
+
+[pep658]: https://peps.python.org/pep-0658/
+[managepy]: https://github.com/pytorch/test-infra/blob/73eea9088162354f937230cb518f19f50f557062/s3_management/manage.py
diff --git a/aws/lambda/whl_metadata_upload_pep658/lambda_function.py b/aws/lambda/whl_metadata_upload_pep658/lambda_function.py
new file mode 100644
index 0000000000..5b1520d041
--- /dev/null
+++ b/aws/lambda/whl_metadata_upload_pep658/lambda_function.py
@@ -0,0 +1,90 @@
+"""AWS Lambda that publishes PEP 658 metadata files for wheels uploaded to S3.
+
+For each `.whl` object named in the triggering S3 event, the wheel is
+downloaded, its `.dist-info/METADATA` member is extracted, and that file is
+uploaded next to the wheel as `{key}.metadata`.
+"""
+
+import os
+import zipfile
+from functools import cache
+from typing import Any
+from urllib.parse import unquote
+
+import boto3  # type: ignore[import]
+from botocore import UNSIGNED
+from botocore.config import Config
+
+
+@cache
+def get_client(read_only: bool) -> Any:
+    """Return a cached S3 client.
+
+    With `read_only` the client sends unsigned (anonymous) requests and needs
+    no credentials; otherwise boto3's default credentials are used.
+    """
+    if read_only:
+        return boto3.client("s3", config=Config(signature_version=UNSIGNED))
+    return boto3.client("s3")
+
+
+def upload_s3(bucket: str, key: str, filename: str, dry_run: bool) -> None:
+    """Upload local file `filename` to `bucket`/`key` with a public-read ACL.
+
+    When `dry_run` is true the upload is only logged, not performed.
+    """
+    print(f"Uploading to {bucket}/{key}")
+    if not dry_run:
+        get_client(False).upload_file(
+            filename,
+            bucket,
+            key,
+            ExtraArgs={"ChecksumAlgorithm": "sha256", "ACL": "public-read"},
+        )
+
+
+def _is_wheel_metadata(name: str) -> bool:
+    """Return True for the wheel's own top-level `*.dist-info/METADATA`."""
+    # A wheel can bundle other distributions whose dist-info directories sit
+    # deeper in the archive; only the top-level one describes the wheel itself.
+    return name.endswith(".dist-info/METADATA") and name.count("/") == 1
+
+
+def lambda_handler(event: Any, context: Any, dry_run: bool = False) -> None:
+    """Extract and upload METADATA for every wheel named in an S3 event.
+
+    Args:
+        event: S3 event notification; only `Records[].s3.bucket.name` and
+            `Records[].s3.object.key` are read.
+        context: Lambda context object (unused).
+        dry_run: If true, download with the unsigned client and only log the
+            upload instead of performing it.
+    """
+    zip_location = "/tmp/wheel.zip"
+    metadata_location = "/tmp/METADATA"
+    for record in event["Records"]:
+        bucket = record["s3"]["bucket"]["name"]
+        # Keys are percent-decoded before use.
+        key = unquote(record["s3"]["object"]["key"])
+        if not key.endswith(".whl"):
+            print(f"Skipping {bucket}/{key} as it is not a wheel")
+            continue
+        print(f"Processing {bucket}/{key}")
+
+        # Both paths are reused for every record, so clear leftovers first.
+        for stale_path in (zip_location, metadata_location):
+            if os.path.exists(stale_path):
+                os.remove(stale_path)
+
+        get_client(dry_run).download_file(bucket, key, zip_location)
+
+        with zipfile.ZipFile(zip_location, "r") as zip_ref:
+            for member in zip_ref.infolist():
+                if _is_wheel_metadata(member.filename):
+                    # Rename the member so it is extracted flat to /tmp/METADATA.
+                    member.filename = "METADATA"
+                    zip_ref.extract(member, "/tmp")
+                    upload_s3(bucket, f"{key}.metadata", metadata_location, dry_run)
+                    break
+            else:
+                print(f"No top-level .dist-info/METADATA found in {bucket}/{key}")
diff --git a/aws/lambda/whl_metadata_upload_pep658/requirements.txt b/aws/lambda/whl_metadata_upload_pep658/requirements.txt
new file mode 100644
index 0000000000..83ec3c1fd2
--- /dev/null
+++ b/aws/lambda/whl_metadata_upload_pep658/requirements.txt
@@ -0,0 +1 @@
+boto3==1.35.96
diff --git a/aws/lambda/whl_metadata_upload_pep658/test_event.json b/aws/lambda/whl_metadata_upload_pep658/test_event.json
new file mode 100644
index 0000000000..88db2e2355
--- /dev/null
+++ b/aws/lambda/whl_metadata_upload_pep658/test_event.json
@@ -0,0 +1,14 @@
+{
+  "Records": [
+    {
+      "s3": {
+        "bucket": {
+          "name": "pytorch"
+        },
+        "object": {
+          "key": "whl/cpu_pypi_pkg/torch_no_python-2.6.0.dev20240914+cpu-py3-none-any.whl"
+        }
+      }
+    }
+  ]
+}
\ No newline at end of file
diff --git a/aws/lambda/whl_metadata_upload_pep658/test_lambda_function.py b/aws/lambda/whl_metadata_upload_pep658/test_lambda_function.py
new file mode 100644
index 0000000000..59ff2650f2
--- /dev/null
+++ b/aws/lambda/whl_metadata_upload_pep658/test_lambda_function.py
@@ -0,0 +1,73 @@
+"""Local test and backfill driver for the whl_metadata_upload_pep658 lambda.
+
+Feeds test_event.json to lambda_handler, in dry-run mode unless --no-dry-run
+is passed. With --generate-event, test_event.json is first rewritten to list
+every wheel under the given S3 prefix.
+"""
+
+import argparse
+import json
+import os
+from pathlib import Path
+from typing import Generator
+
+from lambda_function import get_client, lambda_handler
+
+GENERATE_EVENT_HELP_TEXT = """
+Generate a test_event.json for all files in this s3 path and test the lambda
+function with this new test_event.json. The test_event.json does not have
+complete data, only known attributes that are needed for the lambda function.
+Format should be `BUCKET/KEY_PREFIX`, ex `pytorch/whl/nightly`. Note that
+you will need more permissions to list objects in the bucket.
+"""
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the command-line flags of the test/backfill driver."""
+    parser = argparse.ArgumentParser()
+    # Default to dry run (not uploading)
+    parser.add_argument("--no-dry-run", action="store_true")
+    parser.add_argument(
+        "--generate-event",
+        metavar="BUCKET/KEY_PREFIX",
+        type=str,
+        help=GENERATE_EVENT_HELP_TEXT,
+    )
+    return parser.parse_args()
+
+
+def get_all_keys(bucket: str, key_prefix: str) -> Generator[str, None, None]:
+    """Yield the key of every `.whl` object in `bucket` under `key_prefix`."""
+    # Listing the bucket needs permissions, hence the signed client.
+    paginator = get_client(False).get_paginator("list_objects_v2")
+    for page in paginator.paginate(Bucket=bucket, Prefix=key_prefix):
+        # "Contents" is absent from a page when nothing matches the prefix.
+        for obj in page.get("Contents", []):
+            if obj["Key"].endswith(".whl"):
+                yield obj["Key"]
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    test_file = Path(__file__).parent / "test_event.json"
+
+    with open(test_file) as f:
+        event = json.load(f)
+    if args.generate_event:
+        # Split "BUCKET/KEY_PREFIX" at the first slash only.
+        bucket, _, key_prefix = args.generate_event.partition("/")
+
+        event["Records"] = [
+            {
+                "s3": {
+                    "bucket": {"name": bucket},
+                    "object": {"key": wheel_key},
+                }
+            }
+            for wheel_key in get_all_keys(bucket, key_prefix)
+        ]
+        # Context manager so the regenerated event file is flushed and closed.
+        with open(test_file, "w") as f:
+            json.dump(event, f, indent=2)
+
+    lambda_handler(event, None, dry_run=not args.no_dry_run)