AWS lambda for automatic whl metadata upload (#6159)
Write an AWS Lambda that extracts and uploads the metadata file from whls for
PEP 658.

I don't think we have Terraform for the pytorch AWS account, so I think this is
the next best bet for keeping this in source control.

Do this here instead of the way it was done in pytorch/pytorch, since this will
apply to all packages in the index, so we can also cover domains and other
packages without code duplication.
clee2000 authored Jan 13, 2025
1 parent adf0025 commit d841d89
Showing 8 changed files with 214 additions and 0 deletions.
48 changes: 48 additions & 0 deletions .github/workflows/deploy_lambda_whl_metadata_upload_pep658.yml
@@ -0,0 +1,48 @@
name: Deploy whl_metadata_upload_pep658 to pytorch AWS account

on:
  pull_request:
    paths:
      - aws/lambda/whl_metadata_upload_pep658/**
  push:
    branches:
      - main
    paths:
      - .github/workflows/deploy_lambda_whl_metadata_upload_pep658.yml
      - aws/lambda/whl_metadata_upload_pep658/**

defaults:
  run:
    working-directory: aws/lambda/whl_metadata_upload_pep658/

jobs:
  test:
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.13'
          cache: pip
      - run: pip install -r requirements.txt
      - run: python test_lambda_function.py

  deploy:
    needs: test
    runs-on: ubuntu-22.04
    if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
    permissions:
      id-token: write
      contents: read
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.13'
          cache: pip
      - name: configure aws credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_whl_metadata_upload_pep658
          aws-region: us-east-1
      - run: make deploy
2 changes: 2 additions & 0 deletions .lintrunner.toml
@@ -38,6 +38,7 @@ include_patterns = [
    'torchci/**/*.py',
    'torchci/**/*.pyi',
    '.github/scripts/*.py',
    'aws/lambda/whl_metadata_upload_pep658/**/*.py',
]
command = [
    'python3',
@@ -259,6 +260,7 @@ code = 'UFMT'
include_patterns = [
    'tools/torchci/**/*.py',
    '.github/scripts/*.py',
    'aws/lambda/whl_metadata_upload_pep658/**/*.py',
]
command = [
    'python3',
11 changes: 11 additions & 0 deletions aws/lambda/whl_metadata_upload_pep658/Makefile
@@ -0,0 +1,11 @@
prepare: clean
	mkdir -p ./packages
	pip3 install --target ./packages -r requirements.txt
	cd packages && zip -r ../whl_metadata_upload_pep658.zip .
	zip -g whl_metadata_upload_pep658.zip lambda_function.py

deploy: prepare
	aws lambda update-function-code --function-name whl_metadata_upload_pep658 --zip-file fileb://whl_metadata_upload_pep658.zip

clean:
	rm -rf whl_metadata_upload_pep658.zip packages
23 changes: 23 additions & 0 deletions aws/lambda/whl_metadata_upload_pep658/README.md
@@ -0,0 +1,23 @@
This lambda runs in the pytorch AWS account and uploads the metadata file
extracted from each whl so that it can be served per [pep658]. The metadata
files are then added to the index by [s3_management/manage.py][managepy].
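
For context, [pep658] lets an installer fetch just the core metadata by
appending `.metadata` to a wheel's URL instead of downloading the whole wheel.
A minimal sketch of what that lookup looks like once this lambda and
`manage.py` have run (the wheel URL below is a made-up example):

```python
from urllib.request import urlopen

# Hypothetical wheel URL on the index; this lambda uploads the sibling
# "<wheel>.metadata" object that the request below relies on.
wheel_url = "https://download.pytorch.org/whl/cpu/some_pkg-1.0-py3-none-any.whl"

# Per PEP 658 the core metadata is served at the wheel URL + ".metadata",
# so resolvers can read Requires-Dist etc. without fetching the full wheel.
with urlopen(wheel_url + ".metadata") as resp:
    metadata = resp.read().decode()

print(metadata.splitlines()[:5])  # Metadata-Version, Name, Version, ...
```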

This account does not use Terraform, so this is the source of truth for the
code, and the configuration should be:
* time limit: at least 30s?
* ephemeral memory: at least the size of the largest whl we want to upload metadata for
* Triggers:
  * s3: put object events from the pytorch bucket with suffix `.whl` (see the sketch below)
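
A hedged sketch of how that S3 trigger could be wired up with boto3 (the Lambda
ARN is a placeholder, the bucket must already allow S3 to invoke the Lambda,
and the real account may well configure this through the console instead):

```python
import boto3

s3 = boto3.client("s3")

# Placeholder ARN; the real value lives in the pytorch AWS account and is not
# tracked in this repo.
LAMBDA_ARN = "arn:aws:lambda:us-east-1:<account-id>:function:whl_metadata_upload_pep658"

# Invoke the lambda on PutObject events for keys ending in ".whl", matching the
# trigger described above. Note: this call replaces any existing notification
# configuration on the bucket.
s3.put_bucket_notification_configuration(
    Bucket="pytorch",
    NotificationConfiguration={
        "LambdaFunctionConfigurations": [
            {
                "LambdaFunctionArn": LAMBDA_ARN,
                "Events": ["s3:ObjectCreated:Put"],
                "Filter": {
                    "Key": {"FilterRules": [{"Name": "suffix", "Value": ".whl"}]}
                },
            }
        ]
    },
)
```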

### Deployment

A new version of the lambda can be deployed using `make deploy`. This also
happens automatically in CI via
`.github/workflows/deploy_lambda_whl_metadata_upload_pep658.yml`.

### Testing + Backfill

Please see `test_lambda_function.py`. By default it replays `test_event.json`
against the handler in dry-run mode; pass `--no-dry-run` to actually upload, or
use `--generate-event <bucket>/<key prefix>` to build an event for existing
wheels and backfill their metadata.

[pep658]: https://peps.python.org/pep-0658/
[managepy]: https://github.com/pytorch/test-infra/blob/73eea9088162354f937230cb518f19f50f557062/s3_management/manage.py
55 changes: 55 additions & 0 deletions aws/lambda/whl_metadata_upload_pep658/lambda_function.py
@@ -0,0 +1,55 @@
import os
import zipfile
from functools import cache
from typing import Any
from urllib.parse import unquote

import boto3  # type: ignore[import]
from botocore import UNSIGNED
from botocore.config import Config


@cache
def get_client(read_only: bool) -> Any:
    # Use an unsigned (anonymous) client for read-only access, real credentials otherwise
    if read_only:
        return boto3.client("s3", config=Config(signature_version=UNSIGNED))
    return boto3.client("s3")


def upload_s3(bucket: str, key: str, filename: str, dry_run: bool) -> None:
    print(f"Uploading to {bucket}/{key}")
    if not dry_run:
        get_client(False).upload_file(
            filename,
            bucket,
            key,
            ExtraArgs={"ChecksumAlgorithm": "sha256", "ACL": "public-read"},
        )


def lambda_handler(event: Any, context: Any, dry_run: bool = False) -> None:
    zip_location = "/tmp/wheel.zip"
    metadata_location = "/tmp/METADATA"
    for record in event["Records"]:
        bucket = record["s3"]["bucket"]["name"]
        key = unquote(record["s3"]["object"]["key"])
        if not key.endswith(".whl"):
            print(f"Skipping {bucket}/{key} as it is not a wheel")
            continue
        print(f"Processing {bucket}/{key}")

        if os.path.exists(zip_location):
            os.remove(zip_location)

        # In dry runs, download with the unsigned client so no credentials are needed
        get_client(dry_run).download_file(bucket, key, zip_location)

        if os.path.exists(metadata_location):
            os.remove(metadata_location)

        with zipfile.ZipFile(zip_location, "r") as zip_ref:
            for filename in zip_ref.infolist():
                if filename.filename.endswith(".dist-info/METADATA"):
                    # Rename so the file is extracted to /tmp/METADATA
                    filename.filename = "METADATA"
                    zip_ref.extract(filename, "/tmp")
                    # Upload next to the wheel as "<wheel key>.metadata" (PEP 658)
                    upload_s3(bucket, f"{key}.metadata", metadata_location, dry_run)
                    break
1 change: 1 addition & 0 deletions aws/lambda/whl_metadata_upload_pep658/requirements.txt
@@ -0,0 +1 @@
boto3==1.35.96
14 changes: 14 additions & 0 deletions aws/lambda/whl_metadata_upload_pep658/test_event.json
@@ -0,0 +1,14 @@
{
  "Records": [
    {
      "s3": {
        "bucket": {
          "name": "pytorch"
        },
        "object": {
          "key": "whl/cpu_pypi_pkg/torch_no_python-2.6.0.dev20240914+cpu-py3-none-any.whl"
        }
      }
    }
  ]
}
60 changes: 60 additions & 0 deletions aws/lambda/whl_metadata_upload_pep658/test_lambda_function.py
@@ -0,0 +1,60 @@
import argparse
import json
import os
from pathlib import Path
from typing import Generator

from lambda_function import get_client, lambda_handler

GENERATE_EVENT_HELP_TEXT = """
Generate a test_event.json for all files in this s3 path and test the lambda
function with this new test_event.json. The test_event.json does not have
complete data, only known attributes that are needed for the lambda function.
Format should be `<bucket>/<key prefix>`, e.g. `pytorch/whl/nightly`. Note that
you will need more permissions to list objects in the bucket.
"""


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    # Default to dry run (not uploading)
    parser.add_argument("--no-dry-run", action="store_true")
    parser.add_argument(
        "--generate-event",
        metavar="BUCKET/KEY_PREFIX",
        type=str,
        help=GENERATE_EVENT_HELP_TEXT,
    )
    return parser.parse_args()


def get_all_keys(bucket: str, key_prefix: str) -> Generator[str, None, None]:
    paginator = get_client(False).get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=key_prefix):
        for obj in page["Contents"]:
            if obj["Key"].endswith(".whl"):
                yield obj["Key"]


if __name__ == "__main__":
    args = parse_args()
    test_file = Path(__file__).parent / "test_event.json"

    with open(test_file) as f:
        event = json.load(f)
    if args.generate_event:
        bucket = args.generate_event.split("/")[0]
        key = args.generate_event[len(bucket) + 1 :]

        # Replace the canned record with one record per wheel under the prefix
        event["Records"] = [
            {
                "s3": {
                    "bucket": {"name": bucket},
                    "object": {"key": key},
                }
            }
            for key in get_all_keys(bucket, key)
        ]
        json.dump(event, open(test_file, "w"), indent=2)

    lambda_handler(event, None, dry_run=not args.no_dry_run)
