Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Foundry client and server #61

Open
wants to merge 26 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 13 additions & 3 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,24 @@ jobs:
name: Test with Python ${{ matrix.version }}
steps:
  - uses: actions/checkout@v2

  - name: Set up Python ${{ matrix.version }}
    uses: actions/setup-python@v2
    with:
      python-version: ${{ matrix.version }}

  - name: Set up Docker Buildx
    uses: docker/setup-buildx-action@v3

  # Build the image the Foundry tests run against. `make docker` tags it as
  # $(DOCKER_WS).azurecr.io/$(DOCKER_IMAGE), i.e. `testwsacr.azurecr.io/aurora-foundry` here.
  - name: Build Foundry image
    run: |
      DOCKER_IMAGE=aurora-foundry make docker

  - name: Install dependencies
    run: |
      python -m pip install --upgrade pip
      python -m pip install --upgrade --no-cache-dir -e '.[dev]'

  # NOTE(review): `make docker` above tagged the image with the full registry prefix, so the
  # tests must reference that name — a bare `aurora-foundry` tag was never created. This
  # matches how the Makefile's own `test` target prefixes the image name.
  - name: Run tests
    run: |
      DOCKER_IMAGE=testwsacr.azurecr.io/aurora-foundry pytest -v --cov=aurora --cov-report term-missing
45 changes: 45 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Use an official Python runtime as a parent image.
FROM mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest

WORKDIR /aurora_foundry
COPY ./pyproject.toml .

# Assuming dependencies are fairly fixed, we can install them first and then copy the rest of the
# code to avoid re-installing dependencies when the code changes.
COPY _docker_requirements.txt .
# All Python work happens in a dedicated Python 3.10 virtualenv at ./venv; every later RUN step
# and the final CMD invoke that interpreter explicitly.
RUN pip install --upgrade pip virtualenv && \
    virtualenv venv -p python3.10 && \
    . venv/bin/activate && \
    pip install -r _docker_requirements.txt

# Download model weights. This bakes the checkpoints into the image at build time (into the
# Hugging Face cache), so the container does not need to fetch them at runtime.
RUN ./venv/bin/python -c 'from huggingface_hub import hf_hub_download; hf_hub_download(repo_id="microsoft/aurora", filename="aurora-0.25-small-pretrained.ckpt")' && \
    ./venv/bin/python -c 'from huggingface_hub import hf_hub_download; hf_hub_download(repo_id="microsoft/aurora", filename="aurora-0.25-finetuned.ckpt")'

COPY ./LICENSE.txt .
COPY ./README.md .

# Install `azcopy` and the AML inference server.
# NOTE(review): assumes `wget` and `tar` exist in the base image — confirm.
RUN wget https://aka.ms/downloadazcopy-v10-linux -O azcopy.tar.gz && \
    cp $(tar -xvzf azcopy.tar.gz | grep azcopy$) /usr/local/bin/azcopy
RUN . ./venv/bin/activate && \
    pip install azureml-inference-server-http

# Copy the package source last so code edits do not invalidate the dependency layers above.
COPY ./aurora ./aurora
# The image contains no .git metadata (only the files COPY'd above), so setuptools-scm cannot
# derive a version; it must be supplied explicitly via the AURORA_REPO_VERSION build argument.
ARG AURORA_REPO_VERSION
RUN [ ! -z "${AURORA_REPO_VERSION}" ] || { echo "AURORA_REPO_VERSION must be set."; exit 1; } && \
    . venv/bin/activate && \
    SETUPTOOLS_SCM_PRETEND_VERSION="$AURORA_REPO_VERSION" pip install -e .

# Make port 5001 available to the world outside this container.
EXPOSE 5001
ENV PORT=5001

# We don't have a swagger2.json file, so we'll just "ignore" the version option and always return
# a version 3 file: every swagger variant the server may serve is the same v3 document.
RUN cp ./aurora/foundry/server/swagger3.json ./swagger2.json && \
    cp ./aurora/foundry/server/swagger3.json ./swagger2.0.json && \
    cp ./aurora/foundry/server/swagger3.json ./swagger3.1.json && \
    cp ./aurora/foundry/server/swagger3.json ./swagger3.0.json && \
    cp ./aurora/foundry/server/swagger3.json ./swagger3.json

CMD ["./venv/bin/azmlinfsrv", "--entry_script", "aurora/foundry/server/score.py"]
24 changes: 22 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,13 +1,33 @@
.PHONY: install test docs docker-requirements docker docker-acr swagger-file

# Registry workspace and image tag; override on the command line, e.g.
# `make docker DOCKER_WS=myacr DOCKER_IMAGE=aurora-foundry:latest`.
DOCKER_WS ?= testwsacr
DOCKER_IMAGE ?= aurora-foundry:20250110-1

install:
	pip install --upgrade pip
	pip install -e ".[dev]"
	pre-commit install

# The tests need the fully qualified image name that `docker` below builds.
test:
	DOCKER_IMAGE=$(DOCKER_WS).azurecr.io/$(DOCKER_IMAGE) pytest tests -v --cov=aurora --cov-report=term --cov-report=html

docs:
	jupyter-book build docs
	cp -r docs/_extras/* docs/_build/html/

# Pin the Docker build dependencies from pyproject.toml.
docker-requirements: pyproject.toml
	(pip show pip-tools 1>/dev/null) || pip install pip-tools
	pip-compile --verbose --output-file _docker_requirements.txt pyproject.toml

# Build the Foundry image locally, stamping the package version via setuptools-scm
# (the Dockerfile requires the AURORA_REPO_VERSION build argument).
docker:
	(pip show setuptools-scm 1>/dev/null) || pip install setuptools-scm
	AURORA_REPO_VERSION=`python -m setuptools_scm` docker build --build-arg AURORA_REPO_VERSION -t $(DOCKER_WS).azurecr.io/$(DOCKER_IMAGE) .

# Build the image remotely with Azure Container Registry. Requires ACR=<registry name>.
docker-acr:
	(pip show setuptools-scm 1>/dev/null) || pip install setuptools-scm
	[ ! -z "$(ACR)" ] || { echo "ACR must be set."; exit 1; }
	AURORA_REPO_VERSION=`python -m setuptools_scm` az acr build --build-arg AURORA_REPO_VERSION -r "$(ACR)" -t $(DOCKER_IMAGE) .

# Regenerate the OpenAPI spec that the server container serves.
swagger-file:
	pip install fastapi
	python aurora/foundry/server/generate-swagger.py aurora/foundry/server/swagger3.json
137 changes: 137 additions & 0 deletions _docker_requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
#
# This file is autogenerated by pip-compile with Python 3.10
# by the following command:
#
# pip-compile --output-file=_docker_requirements.txt pyproject.toml
#
annotated-types==0.7.0
# via pydantic
certifi==2024.12.14
# via
# netcdf4
# requests
cftime==1.6.4.post1
# via netcdf4
charset-normalizer==3.4.1
# via requests
einops==0.8.0
# via microsoft-aurora (pyproject.toml)
filelock==3.16.1
# via
# huggingface-hub
# torch
# triton
fsspec==2024.12.0
# via
# huggingface-hub
# torch
huggingface-hub==0.27.1
# via
# microsoft-aurora (pyproject.toml)
# timm
idna==3.10
# via requests
jinja2==3.1.5
# via torch
markupsafe==3.0.2
# via jinja2
mpmath==1.3.0
# via sympy
netcdf4==1.7.2
# via microsoft-aurora (pyproject.toml)
networkx==3.4.2
# via torch
numpy==2.2.1
# via
# cftime
# microsoft-aurora (pyproject.toml)
# netcdf4
# pandas
# scipy
# torchvision
# xarray
nvidia-cublas-cu12==12.4.5.8
# via
# nvidia-cudnn-cu12
# nvidia-cusolver-cu12
# torch
nvidia-cuda-cupti-cu12==12.4.127
# via torch
nvidia-cuda-nvrtc-cu12==12.4.127
# via torch
nvidia-cuda-runtime-cu12==12.4.127
# via torch
nvidia-cudnn-cu12==9.1.0.70
# via torch
nvidia-cufft-cu12==11.2.1.3
# via torch
nvidia-curand-cu12==10.3.5.147
# via torch
nvidia-cusolver-cu12==11.6.1.9
# via torch
nvidia-cusparse-cu12==12.3.1.170
# via
# nvidia-cusolver-cu12
# torch
nvidia-nccl-cu12==2.21.5
# via torch
nvidia-nvjitlink-cu12==12.4.127
# via
# nvidia-cusolver-cu12
# nvidia-cusparse-cu12
# torch
nvidia-nvtx-cu12==12.4.127
# via torch
packaging==24.2
# via
# huggingface-hub
# xarray
pandas==2.2.3
# via xarray
pillow==11.1.0
# via torchvision
pydantic==2.10.4
# via microsoft-aurora (pyproject.toml)
pydantic-core==2.27.2
# via pydantic
python-dateutil==2.9.0.post0
# via pandas
pytz==2024.2
# via pandas
pyyaml==6.0.2
# via
# huggingface-hub
# timm
requests==2.32.3
# via huggingface-hub
scipy==1.15.0
# via microsoft-aurora (pyproject.toml)
six==1.17.0
# via python-dateutil
sympy==1.13.1
# via torch
timm==0.6.13
# via microsoft-aurora (pyproject.toml)
torch==2.5.1
# via
# microsoft-aurora (pyproject.toml)
# timm
# torchvision
torchvision==0.20.1
# via timm
tqdm==4.67.1
# via huggingface-hub
triton==3.1.0
# via torch
typing-extensions==4.12.2
# via
# huggingface-hub
# pydantic
# pydantic-core
# torch
tzdata==2024.2
# via pandas
urllib3==2.3.0
# via requests
xarray==2025.1.0
# via microsoft-aurora (pyproject.toml)
12 changes: 12 additions & 0 deletions aurora/foundry/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
"""Copyright (c) Microsoft Corporation. Licensed under the MIT license."""

from aurora.foundry.client.api import SubmissionError, submit
from aurora.foundry.client.foundry import FoundryClient
from aurora.foundry.common.channel import BlobStorageCommunication

__all__ = [
"BlobStorageCommunication",
"FoundryClient",
"submit",
"SubmissionError",
]
1 change: 1 addition & 0 deletions aurora/foundry/client/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Copyright (c) Microsoft Corporation. Licensed under the MIT license."""
102 changes: 102 additions & 0 deletions aurora/foundry/client/api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
"""Copyright (c) Microsoft Corporation. Licensed under the MIT license.

This is the API that the end user uses to submit jobs to the model running on Azure AI Foundry.
"""

import logging
import time
from typing import Generator

from pydantic import BaseModel

from aurora import Batch
from aurora.foundry.client.foundry import AbstractFoundryClient
from aurora.foundry.common.channel import CommunicationChannel, iterate_prediction_files
from aurora.foundry.common.model import models

__all__ = ["SubmissionError", "submit"]

logger = logging.getLogger(__name__)


class SubmissionInfo(BaseModel):
    """Response returned by the endpoint when a task submission is accepted."""

    task_id: str  # Identifier used to poll progress and to address files in the channel.


class ProgressInfo(BaseModel):
    """Progress report for a previously submitted task."""

    task_id: str  # Task this report refers to.
    completed: bool  # Whether the task finished producing all predictions.
    progress_percentage: int  # Completion as a whole-number percentage.
    error: bool  # Whether the task failed.
    error_info: str  # Description of the failure; meaningful when `error` is set.


class SubmissionError(Exception):
    """The submission could not be completed for some reason.

    Raised both when the endpoint rejects the initial submission and when a
    submitted task later reports an error during progress polling.
    """


def submit(
    batch: Batch,
    model_name: str,
    num_steps: int,
    client_comm: CommunicationChannel,
    host_comm: CommunicationChannel,
    foundry_client: AbstractFoundryClient,
    poll_interval: float = 1.0,
) -> Generator[Batch, None, None]:
    """Submit a request to Azure AI Foundry and retrieve the predictions.

    Args:
        batch (:class:`aurora.Batch`): Initial condition.
        model_name (str): Name of the model. This name must be available in
            :mod:`aurora_foundry.common.model`.
        num_steps (int): Number of prediction steps.
        client_comm (:class:`aurora_foundry.common.comm.CommunicationChannel`): Channel that the
            client uses to send and receive data.
        host_comm (:class:`aurora_foundry.common.comm.CommunicationChannel`): Channel that the host
            uses to send and receive data.
        foundry_client (:class:`aurora_foundry.client.foundry.AbstractFoundryClient`): Client to
            communicate with Azure Foundry AI.
        poll_interval (float, optional): Seconds to wait between progress checks. Defaults
            to `1.0`.

    Raises:
        KeyError: If `model_name` is not a known model.
        SubmissionError: If the endpoint rejects the submission or the task later fails.

    Yields:
        :class:`aurora.Batch`: Predictions.
    """
    if model_name not in models:
        raise KeyError(f"Model `{model_name}` is not a valid model.")

    # Send a request to the endpoint to produce the predictions. The host exchanges data via
    # the folder described by `host_comm`.
    task = {
        "model_name": model_name,
        "num_steps": num_steps,
        "data_folder_uri": host_comm.to_spec(),
    }
    response = foundry_client.submit_task(task)
    try:
        submission_info = SubmissionInfo(**response)
    except Exception as e:
        # The response did not parse as a successful submission; surface its message instead.
        raise SubmissionError(response["message"]) from e
    task_id = submission_info.task_id
    logger.info("Submitted task %r to endpoint.", task_id)

    # Send the initial condition over.
    client_comm.send(batch, task_id, "input.nc")

    previous_progress: int = 0

    while True:
        # Check on the progress of the task.
        response = foundry_client.get_progress(task_id)
        progress_info = ProgressInfo(**response)

        if progress_info.error:
            raise SubmissionError(f"Task failed: {progress_info.error_info}")

        # Only log when progress actually advanced, to keep the log readable.
        if progress_info.progress_percentage > previous_progress:
            logger.info("Task progress update: %d%%.", progress_info.progress_percentage)
            previous_progress = progress_info.progress_percentage

        if progress_info.completed:
            logger.info("Task has been completed!")
            break

        # Pause between polls instead of busy-waiting, so the endpoint is not hammered
        # with back-to-back progress requests.
        time.sleep(poll_interval)

    logger.info("Retrieving predictions.")
    for prediction_name in iterate_prediction_files("prediction.nc", num_steps):
        yield client_comm.receive(task_id, prediction_name)
Loading
Loading