
Commit

Update
icedoom888 committed Sep 18, 2024
2 parents 28d8a26 + 436fe5d commit 6e048e8
Showing 71 changed files with 3,660 additions and 372 deletions.
1 change: 1 addition & 0 deletions .gitattributes
@@ -0,0 +1 @@
CHANGELOG.md merge=union
6 changes: 6 additions & 0 deletions .github/CODEOWNERS
@@ -0,0 +1,6 @@
# CODEOWNERS file

# Protect workflow files
/.github/ @theissenhelen @jesperdramsch @gmertes @b8raoult @floriankrb
/.pre-commit-config.yaml @theissenhelen @jesperdramsch @gmertes @b8raoult @floriankrb
/pyproject.toml @theissenhelen @jesperdramsch @gmertes @b8raoult @floriankrb
20 changes: 20 additions & 0 deletions .github/ci-hpc-config.yml
@@ -0,0 +1,20 @@
build:
  modules:
    - ninja
  dependencies:
    - ecmwf/ecbuild@develop
    - ecmwf/eccodes@develop
    - ecmwf/eckit@develop
    - ecmwf/odc@develop
  python_dependencies:
    - ecmwf/anemoi-utils@develop
    - ecmwf/earthkit-data@develop
    - ecmwf/earthkit-meteo@develop
    - ecmwf/earthkit-geo@develop
  parallel: 64

pytest_cmd: |
  python -m pytest -vv -m 'not notebook and not no_cache_init' --cov=. --cov-report=xml
  python -m pytest -v -m 'notebook'
  python -m pytest --forked -vv -m 'no_cache_init'
  python -m coverage report
3 changes: 3 additions & 0 deletions .github/workflows/changelog-pr-update.yml
@@ -5,6 +5,9 @@ on:
    branches:
      - main
      - develop
    paths-ignore:
      - .pre-commit-config.yaml
      - .readthedocs.yaml
jobs:
  Check-Changelog:
    name: Check Changelog Action
34 changes: 34 additions & 0 deletions .github/workflows/changelog-release-update.yml
@@ -0,0 +1,34 @@
# .github/workflows/update-changelog.yaml
name: "Update Changelog"

on:
  release:
    types: [released]

permissions:
  pull-requests: write
  contents: write

jobs:
  update:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event.release.target_commitish }}

      - name: Update Changelog
        uses: stefanzweifel/changelog-updater-action@v1
        with:
          latest-version: ${{ github.event.release.tag_name }}
          heading-text: ${{ github.event.release.name }}

      - name: Create Pull Request
        uses: peter-evans/create-pull-request@v6
        with:
          branch: docs/changelog-update-${{ github.event.release.tag_name }}
          title: '[Changelog] Update to ${{ github.event.release.tag_name }}'
          add-paths: |
            CHANGELOG.md
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -34,7 +34,7 @@ jobs:
  downstream-ci-hpc:
    name: downstream-ci-hpc
    if: ${{ !github.event.pull_request.head.repo.fork && github.event.action != 'labeled' || github.event.label.name == 'approved-for-ci' }}
    uses: ecmwf-actions/downstream-ci/.github/workflows/downstream-ci.yml@main
    uses: ecmwf-actions/downstream-ci/.github/workflows/downstream-ci-hpc.yml@main
    with:
      anemoi-datasets: ecmwf/anemoi-datasets@${{ github.event.pull_request.head.sha || github.sha }}
    secrets: inherit
39 changes: 4 additions & 35 deletions .github/workflows/python-publish.yml
@@ -4,48 +4,17 @@
name: Upload Python Package

on:
  push: {}
  pull_request:
  release:
    types: [created]

jobs:
  quality:
    name: Code QA
    runs-on: ubuntu-latest
    steps:
      - run: sudo apt-get install -y pandoc # Needed by sphinx for notebooks
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: 3.x
      - uses: pre-commit/[email protected]
    uses: ecmwf-actions/reusable-workflows/.github/workflows/qa-precommit-run.yml@v2
    with:
      skip-hooks: "no-commit-to-branch"

  checks:
    strategy:
      fail-fast: false
      matrix:
        platform: ["ubuntu-latest", "macos-latest"]
        python-version: ["3.10"]

    name: Python ${{ matrix.python-version }} on ${{ matrix.platform }}
    runs-on: ${{ matrix.platform }}

    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install
        run: |
          pip install -e .[all,tests]
          pip freeze
      - name: Tests
        run: pytest
    uses: ecmwf-actions/reusable-workflows/.github/workflows/qa-pytest-pyproject.yml@v2

  deploy:
    needs: [checks, quality]
23 changes: 23 additions & 0 deletions .github/workflows/python-pull-request.yml
@@ -0,0 +1,23 @@
# This workflow will upload a Python Package using Twine when a release is created
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries

name: Code Quality checks for PRs

on:
  push:
  pull_request_target:
    types: [opened, synchronize, reopened]

jobs:
  quality:
    uses: ecmwf-actions/reusable-workflows/.github/workflows/qa-precommit-run.yml@v2
    with:
      skip-hooks: "no-commit-to-branch"

  checks:
    strategy:
      matrix:
        python-version: ["3.9", "3.10"]
    uses: ecmwf-actions/reusable-workflows/.github/workflows/qa-pytest-pyproject.yml@v2
    with:
      python-version: ${{ matrix.python-version }}
4 changes: 4 additions & 0 deletions .gitignore
@@ -120,6 +120,7 @@ celerybeat.pid
*.sage.py

# Environments
.envrc
.env
.venv
env/
@@ -190,3 +191,6 @@ _build/
_dev/
*.to_upload
*.tmp
test.ipynb
*tmp_data/
tempCodeRunnerFile.python
26 changes: 15 additions & 11 deletions .pre-commit-config.yaml
@@ -20,8 +20,14 @@ repos:
      - id: no-commit-to-branch # Prevent committing to main / master
      - id: check-added-large-files # Check for large files added to git
      - id: check-merge-conflict # Check for files that contain merge conflict
  - repo: https://github.com/pre-commit/pygrep-hooks
    rev: v1.10.0 # Use the ref you want to point at
    hooks:
      - id: python-use-type-annotations # Check for missing type annotations
      - id: python-check-blanket-noqa # Check for # noqa: all
      - id: python-no-log-warn # Check for log.warn
  - repo: https://github.com/psf/black-pre-commit-mirror
    rev: 24.4.2
    rev: 24.8.0
    hooks:
      - id: black
        args: [--line-length=120]
@@ -34,18 +40,18 @@
        - --force-single-line-imports
        - --profile black
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.4.6
    rev: v0.6.4
    hooks:
      - id: ruff
        # Next line is to exclude for documentation code snippets
        exclude: 'docs/(.*/)?[a-z]\w+_.py$'
        # Next line is for documentation code snippets
        exclude: '^[^_].*_\.py$'
        args:
          - --line-length=120
          - --fix
          - --exit-non-zero-on-fix
          - --preview
  - repo: https://github.com/sphinx-contrib/sphinx-lint
    rev: v0.9.1
    rev: v1.0.0
    hooks:
      - id: sphinx-lint
        # For now, we use it. But it does not support a lot of sphinx features
@@ -59,12 +65,10 @@
    hooks:
      - id: docconvert
        args: ["numpy"]
  - repo: https://github.com/b8raoult/optional-dependencies-all
    rev: "0.0.6"
    hooks:
      - id: optional-dependencies-all
        args: ["--inplace", "--exclude-keys=dev,docs,tests", "--group=dev=all,docs,tests"]
  - repo: https://github.com/tox-dev/pyproject-fmt
    rev: "2.1.3"
    rev: "2.2.3"
    hooks:
      - id: pyproject-fmt

ci:
  autoupdate_schedule: monthly
16 changes: 16 additions & 0 deletions CHANGELOG.md
@@ -11,13 +11,29 @@ Keep it human-readable, your future self will thank you!
## [Unreleased]

### Added
### Changed

- Added incremental building of datasets

### Removed

## [0.4.5]

### Added

- New `interpolate_frequency` keyword in `open_dataset`
- CI workflow to update the changelog on release
- Adds the reusable CD PyPI workflow
- Merge strategy for changelog in .gitattributes (#25)
- Adds CI HPC config (#43)

### Changed

- Update CI to reusable workflows for PRs and releases
- Support sub-hourly datasets.
- Change negative variance detection to make it less restrictive
- Fix cutout bug that left some global grid points in the LAM part
- Fix bug in computing missing dates in cutout option

### Removed

120 changes: 120 additions & 0 deletions docs/building/incremental.rst
@@ -0,0 +1,120 @@
.. _create-incremental:

##################################
Creating a dataset incrementally
##################################

This guide shows how to create a dataset incrementally. This is useful
when you have a large dataset that you want to load in parts, to avoid
running out of memory. Because parts can be loaded in parallel, this can
also speed up the process.

**********************
Building the dataset
**********************

You first need to create an empty dataset with the `init` command, which
is similar to the `create` command. If there is already a dataset with
the same name, you can use the `--overwrite` flag to replace it. The
`init` command requires a YAML file with the dataset configuration and a
name for the dataset. The content of the YAML file will be copied into
the dataset, so it is not needed by subsequent commands.

.. code:: bash

   anemoi-datasets init dataset.yaml dataset.zarr --overwrite

You can then load the dataset in parts with the `load` command. You just
pass which part you want to load with the `--part` flag.

.. note::

Parts are numbered from 1 to N, where N is the total number of parts
(unlike Python, where they would start at zero). This is to make it
easier to use the `seq(1)` command in bash.

You can load multiple parts in any order and in parallel by running the
`load` command in different terminals, SLURM jobs, or any other
parallelisation tool. The library relies on the `zarr` library to handle
concurrent writes.

.. code:: bash

   anemoi-datasets load dataset.zarr --part 1/20

.. code:: bash

   anemoi-datasets load dataset.zarr --part 2/20

... and so on, until:

.. code:: bash

   anemoi-datasets load dataset.zarr --part 20/20
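
Since the parts are independent, you can also drive the whole sequence
with `seq(1)` and run the loads concurrently. Below is a minimal
sketch, assuming 20 parts and plain background jobs; a SLURM array or
any other scheduler follows the same pattern.

.. code:: bash

   # Launch all 20 part loads in the background, then wait for them all.
   for i in $(seq 1 20); do
       anemoi-datasets load dataset.zarr --part "$i/20" &
   done
   wait
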
Once you have loaded all the parts, you can finalise the dataset with
the `finalise` command. This will write the metadata and attributes to
the dataset, consolidate the statistics, and clean up some temporary
files.

.. code:: bash

   anemoi-datasets finalise dataset.zarr

You can follow the progress of the dataset creation with the `inspect`
command. This will show you the percentage of parts loaded.

.. code:: bash

   anemoi-datasets inspect dataset.zarr

It is possible that some temporary files are left behind at the end of
the process. You can clean them up with the `cleanup` command.

.. code:: bash

   anemoi-datasets cleanup dataset.zarr

***********************
Additional statistics
***********************

`anemoi-datasets` can compute additional statistics for the dataset,
mostly statistics of the increments between two dates (e.g. 6h or 12h).

To add statistics for 6h increments:

.. code:: bash

   anemoi-datasets init-additions dataset.zarr --delta 6h
   anemoi-datasets load-additions dataset.zarr --part 1/2 --delta 6h
   anemoi-datasets load-additions dataset.zarr --part 2/2 --delta 6h
   anemoi-datasets finalise-additions dataset.zarr --delta 6h

To add statistics for 12h increments:

.. code:: bash

   anemoi-datasets init-additions dataset.zarr --delta 12h
   anemoi-datasets load-additions dataset.zarr --part 1/2 --delta 12h
   anemoi-datasets load-additions dataset.zarr --part 2/2 --delta 12h
   anemoi-datasets finalise-additions dataset.zarr --delta 12h
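
The deltas are processed independently, so a simple loop covers all the
steps. Below is a minimal sketch, assuming the two parts and the 6h and
12h deltas shown above.

.. code:: bash

   # Run init, both part loads, and finalise for each delta in turn.
   for delta in 6h 12h; do
       anemoi-datasets init-additions dataset.zarr --delta "$delta"
       for part in 1 2; do
           anemoi-datasets load-additions dataset.zarr --part "$part/2" --delta "$delta"
       done
       anemoi-datasets finalise-additions dataset.zarr --delta "$delta"
   done
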
If this process leaves temporary files behind, you can clean them up
with the `cleanup` command.

.. code:: bash

   anemoi-datasets cleanup dataset.zarr

********************************
Patching the dataset metadata
********************************

The following command will patch the dataset metadata. In particular, it
will remove any references to the YAML file used to initialise the
dataset.

.. code:: bash

   anemoi-datasets patch dataset.zarr

6 changes: 0 additions & 6 deletions docs/building/syntax.rst

This file was deleted.

Binary file modified docs/images.pptx
Binary file not shown.
