AWS lambda for automatic whl metadata upload (#6159)
Write an AWS Lambda that extracts and uploads the metadata file from whls for
PEP 658.

I don't think we have Terraform for the pytorch AWS account, so I think this is
the next best bet for keeping this in source control.

Do this here instead of the way it was done in pytorch/pytorch, since this will
apply to all packages in the index, so we can also cover domains and other
packages without code duplication.
clee2000 authored Jan 13, 2025
1 parent adf0025 commit d841d89
Showing 8 changed files with 214 additions and 0 deletions.
48 changes: 48 additions & 0 deletions .github/workflows/deploy_lambda_whl_metadata_upload_pep658.yml
@@ -0,0 +1,48 @@
name: Deploy whl_metadata_upload_pep658 to pytorch AWS account

on:
  pull_request:
    paths:
      - aws/lambda/whl_metadata_upload_pep658/**
  push:
    branches:
      - main
    paths:
      - .github/workflows/deploy_lambda_whl_metadata_upload_pep658.yml
      - aws/lambda/whl_metadata_upload_pep658/**

defaults:
  run:
    working-directory: aws/lambda/whl_metadata_upload_pep658/

jobs:
  test:
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.13'
          cache: pip
      - run: pip install -r requirements.txt
      - run: python test_lambda_function.py

  deploy:
    needs: test
    runs-on: ubuntu-22.04
    if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
    permissions:
      id-token: write
      contents: read
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.13'
          cache: pip
      - name: configure aws credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_whl_metadata_upload_pep658
          aws-region: us-east-1
      - run: make deploy
2 changes: 2 additions & 0 deletions .lintrunner.toml
@@ -38,6 +38,7 @@ include_patterns = [
    'torchci/**/*.py',
    'torchci/**/*.pyi',
    '.github/scripts/*.py',
    'aws/lambda/whl_metadata_upload_pep658/**/*.py',
]
command = [
    'python3',
@@ -259,6 +260,7 @@ code = 'UFMT'
include_patterns = [
    'tools/torchci/**/*.py',
    '.github/scripts/*.py',
    'aws/lambda/whl_metadata_upload_pep658/**/*.py',
]
command = [
    'python3',
11 changes: 11 additions & 0 deletions aws/lambda/whl_metadata_upload_pep658/Makefile
@@ -0,0 +1,11 @@
prepare: clean
	mkdir -p ./packages
	pip3 install --target ./packages -r requirements.txt
	cd packages && zip -r ../whl_metadata_upload_pep658.zip .
	zip -g whl_metadata_upload_pep658.zip lambda_function.py

deploy: prepare
	aws lambda update-function-code --function-name whl_metadata_upload_pep658 --zip-file fileb://whl_metadata_upload_pep658.zip

clean:
	rm -rf whl_metadata_upload_pep658.zip packages
23 changes: 23 additions & 0 deletions aws/lambda/whl_metadata_upload_pep658/README.md
@@ -0,0 +1,23 @@
This lambda runs in the pytorch AWS account and uploads the metadata file
extracted from each whl so that it can be served per [pep658]. The metadata
files are then added to the index by [s3_management/manage.py][managepy].
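
For context, [pep658] lets an installer fetch just the core metadata by
appending `.metadata` to a wheel's URL instead of downloading the whole wheel.
A minimal sketch of what that lookup looks like once this lambda and
`manage.py` have run (the wheel URL below is a made-up example):

```python
from urllib.request import urlopen

# Hypothetical wheel URL on the index; this lambda uploads the sibling
# "<wheel>.metadata" object that the request below relies on.
wheel_url = "https://download.pytorch.org/whl/cpu/some_pkg-1.0-py3-none-any.whl"

# Per PEP 658 the core metadata is served at the wheel URL + ".metadata",
# so resolvers can read Requires-Dist etc. without fetching the full wheel.
with urlopen(wheel_url + ".metadata") as resp:
    metadata = resp.read().decode()

print(metadata.splitlines()[:5])  # Metadata-Version, Name, Version, ...
```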

This account does not use Terraform, so this is the source of truth for the
code, and the configuration should be:
* time limit: at least 30s?
* ephemeral memory: at least the size of the largest whl we want to upload metadata for
* Triggers:
  * s3: put object events from the pytorch bucket with suffix `.whl` (see the sketch below)
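
A hedged sketch of how that S3 trigger could be wired up with boto3 (the Lambda
ARN is a placeholder, the bucket must already allow S3 to invoke the Lambda,
and the real account may well configure this through the console instead):

```python
import boto3

s3 = boto3.client("s3")

# Placeholder ARN; the real value lives in the pytorch AWS account and is not
# tracked in this repo.
LAMBDA_ARN = "arn:aws:lambda:us-east-1:<account-id>:function:whl_metadata_upload_pep658"

# Invoke the lambda on PutObject events for keys ending in ".whl", matching the
# trigger described above. Note: this call replaces any existing notification
# configuration on the bucket.
s3.put_bucket_notification_configuration(
    Bucket="pytorch",
    NotificationConfiguration={
        "LambdaFunctionConfigurations": [
            {
                "LambdaFunctionArn": LAMBDA_ARN,
                "Events": ["s3:ObjectCreated:Put"],
                "Filter": {
                    "Key": {"FilterRules": [{"Name": "suffix", "Value": ".whl"}]}
                },
            }
        ]
    },
)
```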

### Deployment

A new version of the lambda can be deployed using `make deploy`. This also
happens automatically in CI via
`.github/workflows/deploy_lambda_whl_metadata_upload_pep658.yml`.

### Testing + Backfill

Please see `test_lambda_function.py`. By default it replays `test_event.json`
against the handler in dry-run mode; pass `--no-dry-run` to actually upload, or
use `--generate-event <bucket>/<key prefix>` to build an event for existing
wheels and backfill their metadata.

[pep658]: https://peps.python.org/pep-0658/
[managepy]: https://github.com/pytorch/test-infra/blob/73eea9088162354f937230cb518f19f50f557062/s3_management/manage.py
55 changes: 55 additions & 0 deletions aws/lambda/whl_metadata_upload_pep658/lambda_function.py
@@ -0,0 +1,55 @@
import os
import zipfile
from functools import cache
from typing import Any
from urllib.parse import unquote

import boto3  # type: ignore[import]
from botocore import UNSIGNED
from botocore.config import Config


@cache
def get_client(read_only: bool) -> Any:
    # Use an unsigned (anonymous) client for read-only access, real credentials otherwise
    if read_only:
        return boto3.client("s3", config=Config(signature_version=UNSIGNED))
    return boto3.client("s3")


def upload_s3(bucket: str, key: str, filename: str, dry_run: bool) -> None:
    print(f"Uploading to {bucket}/{key}")
    if not dry_run:
        get_client(False).upload_file(
            filename,
            bucket,
            key,
            ExtraArgs={"ChecksumAlgorithm": "sha256", "ACL": "public-read"},
        )


def lambda_handler(event: Any, context: Any, dry_run: bool = False) -> None:
    zip_location = "/tmp/wheel.zip"
    metadata_location = "/tmp/METADATA"
    for record in event["Records"]:
        bucket = record["s3"]["bucket"]["name"]
        key = unquote(record["s3"]["object"]["key"])
        if not key.endswith(".whl"):
            print(f"Skipping {bucket}/{key} as it is not a wheel")
            continue
        print(f"Processing {bucket}/{key}")

        if os.path.exists(zip_location):
            os.remove(zip_location)

        # In dry runs, download with the unsigned client so no credentials are needed
        get_client(dry_run).download_file(bucket, key, zip_location)

        if os.path.exists(metadata_location):
            os.remove(metadata_location)

        with zipfile.ZipFile(zip_location, "r") as zip_ref:
            for filename in zip_ref.infolist():
                if filename.filename.endswith(".dist-info/METADATA"):
                    # Rename so the file is extracted to /tmp/METADATA
                    filename.filename = "METADATA"
                    zip_ref.extract(filename, "/tmp")
                    # Upload next to the wheel as "<wheel key>.metadata" (PEP 658)
                    upload_s3(bucket, f"{key}.metadata", metadata_location, dry_run)
                    break
1 change: 1 addition & 0 deletions aws/lambda/whl_metadata_upload_pep658/requirements.txt
@@ -0,0 +1 @@
boto3==1.35.96
14 changes: 14 additions & 0 deletions aws/lambda/whl_metadata_upload_pep658/test_event.json
@@ -0,0 +1,14 @@
{
  "Records": [
    {
      "s3": {
        "bucket": {
          "name": "pytorch"
        },
        "object": {
          "key": "whl/cpu_pypi_pkg/torch_no_python-2.6.0.dev20240914+cpu-py3-none-any.whl"
        }
      }
    }
  ]
}
60 changes: 60 additions & 0 deletions aws/lambda/whl_metadata_upload_pep658/test_lambda_function.py
@@ -0,0 +1,60 @@
import argparse
import json
import os
from pathlib import Path
from typing import Generator

from lambda_function import get_client, lambda_handler

GENERATE_EVENT_HELP_TEXT = """
Generate a test_event.json for all files in this s3 path and test the lambda
function with this new test_event.json. The test_event.json does not have
complete data, only known attributes that are needed for the lambda function.
Format should be `<bucket>/<key prefix>`, e.g. `pytorch/whl/nightly`. Note that
you will need more permissions to list objects in the bucket.
"""


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    # Default to dry run (not uploading)
    parser.add_argument("--no-dry-run", action="store_true")
    parser.add_argument(
        "--generate-event",
        metavar="BUCKET/KEY_PREFIX",
        type=str,
        help=GENERATE_EVENT_HELP_TEXT,
    )
    return parser.parse_args()


def get_all_keys(bucket: str, key_prefix: str) -> Generator[str, None, None]:
    paginator = get_client(False).get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=key_prefix):
        for obj in page["Contents"]:
            if obj["Key"].endswith(".whl"):
                yield obj["Key"]


if __name__ == "__main__":
    args = parse_args()
    test_file = Path(__file__).parent / "test_event.json"

    with open(test_file) as f:
        event = json.load(f)
    if args.generate_event:
        bucket = args.generate_event.split("/")[0]
        key = args.generate_event[len(bucket) + 1 :]

        # Replace the canned record with one record per wheel under the prefix
        event["Records"] = [
            {
                "s3": {
                    "bucket": {"name": bucket},
                    "object": {"key": key},
                }
            }
            for key in get_all_keys(bucket, key)
        ]
        json.dump(event, open(test_file, "w"), indent=2)

    lambda_handler(event, None, dry_run=not args.no_dry_run)
