Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Foundry client and server #61

Open
wants to merge 26 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 13 additions & 3 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,24 @@ jobs:
name: Test with Python ${{ matrix.version }}
steps:
  - uses: actions/checkout@v2

  - name: Set up Python ${{ matrix.version }}
    uses: actions/setup-python@v2
    with:
      python-version: ${{ matrix.version }}

  - name: Set up Docker Buildx
    uses: docker/setup-buildx-action@v3

  # Build the image the Foundry tests run against. `make docker` tags it as
  # $(DOCKER_WS).azurecr.io/$(DOCKER_IMAGE), i.e. `testwsacr.azurecr.io/aurora-foundry` here.
  - name: Build Foundry image
    run: |
      DOCKER_IMAGE=aurora-foundry make docker

  - name: Install dependencies
    run: |
      python -m pip install --upgrade pip
      python -m pip install --upgrade --no-cache-dir -e '.[dev]'

  # NOTE(review): `make docker` above tagged the image with the full registry prefix, so the
  # tests must reference that name — a bare `aurora-foundry` tag was never created. This
  # matches how the Makefile's own `test` target prefixes the image name.
  - name: Run tests
    run: |
      DOCKER_IMAGE=testwsacr.azurecr.io/aurora-foundry pytest -v --cov=aurora --cov-report term-missing
45 changes: 45 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Use an official Python runtime as a parent image.
FROM mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest

WORKDIR /aurora_foundry
COPY ./pyproject.toml .

# Assuming dependencies are fairly fixed, we can install them first and then copy the rest of the
# code to avoid re-installing dependencies when the code changes.
COPY _docker_requirements.txt .
# All Python work happens in a dedicated Python 3.10 virtualenv at ./venv; every later RUN step
# and the final CMD invoke that interpreter explicitly.
RUN pip install --upgrade pip virtualenv && \
    virtualenv venv -p python3.10 && \
    . venv/bin/activate && \
    pip install -r _docker_requirements.txt

# Download model weights. This bakes the checkpoints into the image at build time (into the
# Hugging Face cache), so the container does not need to fetch them at runtime.
RUN ./venv/bin/python -c 'from huggingface_hub import hf_hub_download; hf_hub_download(repo_id="microsoft/aurora", filename="aurora-0.25-small-pretrained.ckpt")' && \
    ./venv/bin/python -c 'from huggingface_hub import hf_hub_download; hf_hub_download(repo_id="microsoft/aurora", filename="aurora-0.25-finetuned.ckpt")'

COPY ./LICENSE.txt .
COPY ./README.md .

# Install `azcopy` and the AML inference server.
# NOTE(review): assumes `wget` and `tar` exist in the base image — confirm.
RUN wget https://aka.ms/downloadazcopy-v10-linux -O azcopy.tar.gz && \
    cp $(tar -xvzf azcopy.tar.gz | grep azcopy$) /usr/local/bin/azcopy
RUN . ./venv/bin/activate && \
    pip install azureml-inference-server-http

# Copy the package source last so code edits do not invalidate the dependency layers above.
COPY ./aurora ./aurora
# The image contains no .git metadata (only the files COPY'd above), so setuptools-scm cannot
# derive a version; it must be supplied explicitly via the AURORA_REPO_VERSION build argument.
ARG AURORA_REPO_VERSION
RUN [ ! -z "${AURORA_REPO_VERSION}" ] || { echo "AURORA_REPO_VERSION must be set."; exit 1; } && \
    . venv/bin/activate && \
    SETUPTOOLS_SCM_PRETEND_VERSION="$AURORA_REPO_VERSION" pip install -e .

# Make port 5001 available to the world outside this container.
EXPOSE 5001
ENV PORT=5001

# We don't have a swagger2.json file, so we'll just "ignore" the version option and always return
# a version 3 file: every swagger variant the server may serve is the same v3 document.
RUN cp ./aurora/foundry/server/swagger3.json ./swagger2.json && \
    cp ./aurora/foundry/server/swagger3.json ./swagger2.0.json && \
    cp ./aurora/foundry/server/swagger3.json ./swagger3.1.json && \
    cp ./aurora/foundry/server/swagger3.json ./swagger3.0.json && \
    cp ./aurora/foundry/server/swagger3.json ./swagger3.json

CMD ["./venv/bin/azmlinfsrv", "--entry_script", "aurora/foundry/server/score.py"]
24 changes: 22 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,13 +1,33 @@
.PHONY: install test docs docker-requirements docker docker-acr swagger-file

# Registry workspace and image tag; override on the command line, e.g.
# `make docker DOCKER_WS=myacr DOCKER_IMAGE=aurora-foundry:latest`.
DOCKER_WS ?= testwsacr
DOCKER_IMAGE ?= aurora-foundry:20250110-1

install:
	pip install --upgrade pip
	pip install -e ".[dev]"
	pre-commit install

# The tests need the fully qualified image name that `docker` below builds.
test:
	DOCKER_IMAGE=$(DOCKER_WS).azurecr.io/$(DOCKER_IMAGE) pytest tests -v --cov=aurora --cov-report=term --cov-report=html

docs:
	jupyter-book build docs
	cp -r docs/_extras/* docs/_build/html/

# Pin the Docker build dependencies from pyproject.toml.
docker-requirements: pyproject.toml
	(pip show pip-tools 1>/dev/null) || pip install pip-tools
	pip-compile --verbose --output-file _docker_requirements.txt pyproject.toml

# Build the Foundry image locally, stamping the package version via setuptools-scm
# (the Dockerfile requires the AURORA_REPO_VERSION build argument).
docker:
	(pip show setuptools-scm 1>/dev/null) || pip install setuptools-scm
	AURORA_REPO_VERSION=`python -m setuptools_scm` docker build --build-arg AURORA_REPO_VERSION -t $(DOCKER_WS).azurecr.io/$(DOCKER_IMAGE) .

# Build the image remotely with Azure Container Registry. Requires ACR=<registry name>.
docker-acr:
	(pip show setuptools-scm 1>/dev/null) || pip install setuptools-scm
	[ ! -z "$(ACR)" ] || { echo "ACR must be set."; exit 1; }
	AURORA_REPO_VERSION=`python -m setuptools_scm` az acr build --build-arg AURORA_REPO_VERSION -r "$(ACR)" -t $(DOCKER_IMAGE) .

# Regenerate the OpenAPI spec that the server container serves.
swagger-file:
	pip install fastapi
	python aurora/foundry/server/generate-swagger.py aurora/foundry/server/swagger3.json
137 changes: 137 additions & 0 deletions _docker_requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
#
# This file is autogenerated by pip-compile with Python 3.10
# by the following command:
#
# pip-compile --output-file=_docker_requirements.txt pyproject.toml
#
annotated-types==0.7.0
# via pydantic
certifi==2024.12.14
# via
# netcdf4
# requests
cftime==1.6.4.post1
# via netcdf4
charset-normalizer==3.4.1
# via requests
einops==0.8.0
# via microsoft-aurora (pyproject.toml)
filelock==3.16.1
# via
# huggingface-hub
# torch
# triton
fsspec==2024.12.0
# via
# huggingface-hub
# torch
huggingface-hub==0.27.1
# via
# microsoft-aurora (pyproject.toml)
# timm
idna==3.10
# via requests
jinja2==3.1.5
# via torch
markupsafe==3.0.2
# via jinja2
mpmath==1.3.0
# via sympy
netcdf4==1.7.2
# via microsoft-aurora (pyproject.toml)
networkx==3.4.2
# via torch
numpy==2.2.1
# via
# cftime
# microsoft-aurora (pyproject.toml)
# netcdf4
# pandas
# scipy
# torchvision
# xarray
nvidia-cublas-cu12==12.4.5.8
# via
# nvidia-cudnn-cu12
# nvidia-cusolver-cu12
# torch
nvidia-cuda-cupti-cu12==12.4.127
# via torch
nvidia-cuda-nvrtc-cu12==12.4.127
# via torch
nvidia-cuda-runtime-cu12==12.4.127
# via torch
nvidia-cudnn-cu12==9.1.0.70
# via torch
nvidia-cufft-cu12==11.2.1.3
# via torch
nvidia-curand-cu12==10.3.5.147
# via torch
nvidia-cusolver-cu12==11.6.1.9
# via torch
nvidia-cusparse-cu12==12.3.1.170
# via
# nvidia-cusolver-cu12
# torch
nvidia-nccl-cu12==2.21.5
# via torch
nvidia-nvjitlink-cu12==12.4.127
# via
# nvidia-cusolver-cu12
# nvidia-cusparse-cu12
# torch
nvidia-nvtx-cu12==12.4.127
# via torch
packaging==24.2
# via
# huggingface-hub
# xarray
pandas==2.2.3
# via xarray
pillow==11.1.0
# via torchvision
pydantic==2.10.4
# via microsoft-aurora (pyproject.toml)
pydantic-core==2.27.2
# via pydantic
python-dateutil==2.9.0.post0
# via pandas
pytz==2024.2
# via pandas
pyyaml==6.0.2
# via
# huggingface-hub
# timm
requests==2.32.3
# via huggingface-hub
scipy==1.15.0
# via microsoft-aurora (pyproject.toml)
six==1.17.0
# via python-dateutil
sympy==1.13.1
# via torch
timm==0.6.13
# via microsoft-aurora (pyproject.toml)
torch==2.5.1
# via
# microsoft-aurora (pyproject.toml)
# timm
# torchvision
torchvision==0.20.1
# via timm
tqdm==4.67.1
# via huggingface-hub
triton==3.1.0
# via torch
typing-extensions==4.12.2
# via
# huggingface-hub
# pydantic
# pydantic-core
# torch
tzdata==2024.2
# via pandas
urllib3==2.3.0
# via requests
xarray==2025.1.0
# via microsoft-aurora (pyproject.toml)
12 changes: 12 additions & 0 deletions aurora/foundry/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
"""Copyright (c) Microsoft Corporation. Licensed under the MIT license."""

from aurora.foundry.client.api import SubmissionError, submit
from aurora.foundry.client.foundry import FoundryClient
from aurora.foundry.common.channel import BlobStorageCommunication

__all__ = [
"BlobStorageCommunication",
"FoundryClient",
"submit",
"SubmissionError",
]
1 change: 1 addition & 0 deletions aurora/foundry/client/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Copyright (c) Microsoft Corporation. Licensed under the MIT license."""
102 changes: 102 additions & 0 deletions aurora/foundry/client/api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
"""Copyright (c) Microsoft Corporation. Licensed under the MIT license.

This is the API that the end user uses to submit jobs to the model running on Azure AI Foundry.
"""

import logging
import time
from typing import Generator

from pydantic import BaseModel

from aurora import Batch
from aurora.foundry.client.foundry import AbstractFoundryClient
from aurora.foundry.common.channel import CommunicationChannel, iterate_prediction_files
from aurora.foundry.common.model import models

__all__ = ["SubmissionError", "submit"]

logger = logging.getLogger(__name__)


class SubmissionInfo(BaseModel):
    """Response returned by the endpoint when a task submission is accepted."""

    task_id: str  # Identifier used to poll progress and to address files in the channel.


class ProgressInfo(BaseModel):
    """Progress report for a previously submitted task."""

    task_id: str  # Task this report refers to.
    completed: bool  # Whether the task finished producing all predictions.
    progress_percentage: int  # Completion as a whole-number percentage.
    error: bool  # Whether the task failed.
    error_info: str  # Description of the failure; meaningful when `error` is set.


class SubmissionError(Exception):
    """The submission could not be completed for some reason.

    Raised both when the endpoint rejects the initial submission and when a
    submitted task later reports an error during progress polling.
    """


def submit(
    batch: Batch,
    model_name: str,
    num_steps: int,
    client_comm: CommunicationChannel,
    host_comm: CommunicationChannel,
    foundry_client: AbstractFoundryClient,
    poll_interval: float = 1.0,
) -> Generator[Batch, None, None]:
    """Submit a request to Azure AI Foundry and retrieve the predictions.

    Args:
        batch (:class:`aurora.Batch`): Initial condition.
        model_name (str): Name of the model. This name must be available in
            :mod:`aurora_foundry.common.model`.
        num_steps (int): Number of prediction steps.
        client_comm (:class:`aurora_foundry.common.comm.CommunicationChannel`): Channel that the
            client uses to send and receive data.
        host_comm (:class:`aurora_foundry.common.comm.CommunicationChannel`): Channel that the host
            uses to send and receive data.
        foundry_client (:class:`aurora_foundry.client.foundry.AbstractFoundryClient`): Client to
            communicate with Azure Foundry AI.
        poll_interval (float, optional): Seconds to wait between progress checks. Defaults
            to `1.0`.

    Raises:
        KeyError: If `model_name` is not a known model.
        SubmissionError: If the endpoint rejects the submission or the task later fails.

    Yields:
        :class:`aurora.Batch`: Predictions.
    """
    if model_name not in models:
        raise KeyError(f"Model `{model_name}` is not a valid model.")

    # Send a request to the endpoint to produce the predictions. The host exchanges data via
    # the folder described by `host_comm`.
    task = {
        "model_name": model_name,
        "num_steps": num_steps,
        "data_folder_uri": host_comm.to_spec(),
    }
    response = foundry_client.submit_task(task)
    try:
        submission_info = SubmissionInfo(**response)
    except Exception as e:
        # The response did not parse as a successful submission; surface its message instead.
        raise SubmissionError(response["message"]) from e
    task_id = submission_info.task_id
    logger.info("Submitted task %r to endpoint.", task_id)

    # Send the initial condition over.
    client_comm.send(batch, task_id, "input.nc")

    previous_progress: int = 0

    while True:
        # Check on the progress of the task.
        response = foundry_client.get_progress(task_id)
        progress_info = ProgressInfo(**response)

        if progress_info.error:
            raise SubmissionError(f"Task failed: {progress_info.error_info}")

        # Only log when progress actually advanced, to keep the log readable.
        if progress_info.progress_percentage > previous_progress:
            logger.info("Task progress update: %d%%.", progress_info.progress_percentage)
            previous_progress = progress_info.progress_percentage

        if progress_info.completed:
            logger.info("Task has been completed!")
            break

        # Pause between polls instead of busy-waiting, so the endpoint is not hammered
        # with back-to-back progress requests.
        time.sleep(poll_interval)

    logger.info("Retrieving predictions.")
    for prediction_name in iterate_prediction_files("prediction.nc", num_steps):
        yield client_comm.receive(task_id, prediction_name)
Loading
Loading