esm-tools · siligam · Jul 25, 2024 · Jul 30, 2024 · Aug 1, 2024 · Aug 5, 2024
diff --git a/examples/sample.yaml b/examples/sample.yaml
@@ -29,7 +29,10 @@ rules:
     output_directory: .
     variant_label: r1i1p1f1
     experiment_id: piControl
-    source_id: ocean
+    #source_id: ocean
+    source_id: AWI-CM-1-1-HR
+    model_component: seaIce
+    grid_label: gn
     pipelines:
       - default
   - name: Salinity of the Ocean
@@ -42,7 +45,10 @@ rules:
     output_directory: .
     variant_label: r1i1p1f1
     experiment_id: piControl
-    source_id: ocean
+    #source_id: ocean
+    source_id: AWI-CM-1-1-HR
+    model_component: seaIce
+    grid_label: gn
     pipelines:
       - default
   - name: paul_example_rule
@@ -55,7 +61,10 @@ rules:
     output_directory: .
     variant_label: r1i1p1f1
     experiment_id: piControl
-    source_id: ocean
+    #source_id: ocean
+    source_id: AWI-CM-1-1-HR
+    model_component: ocean
+    grid_label: gn
     pipelines:
       - default
 pipelines:
@@ -65,6 +74,7 @@ pipelines:
       - "pymorize.generic.get_variable"
       - "pymorize.timeaverage.compute_average"
       - "pymorize.units.handle_unit_conversion"
+      - "pymorize.global_attributes.set_global_attributes"
       - "pymorize.generic.trigger_compute"
       - "pymorize.generic.show_data"
       - "pymorize.files.save_dataset"

diff --git a/src/pymorize/controlled_vocabularies.py b/src/pymorize/controlled_vocabularies.py
@@ -5,6 +5,9 @@
 import glob
 import json
 import os
+import re
+
+import requests
 
 
 class ControlledVocabularies(dict):
@@ -66,3 +69,53 @@ def dict_from_json_file(path):
                 return json.load(file)
         except json.JSONDecodeError as e:
             raise ValueError(f"file {path}: {e.msg}")
+
+    @classmethod
+    def load_from_git(cls, tag: str = "6.2.58.64"):
+        """Load the controlled vocabularies from the git repository
+
+        Parameters
+        ----------
+        tag : str or None
+            The git tag to use. Default is "6.2.58.64".
+            If tag is None, the main branch is used.
+        Returns
+        -------
+        ControlledVocabularies
+            A new ControlledVocabularies object, behaves like a dictionary.
+        """
+        if tag is None:
+            tag = "refs/heads/main"
+        else:
+            tag = "refs/tags/" + tag
+        url = f"https://raw.githubusercontent.com/WCRP-CMIP/CMIP6_CVs/{tag}"
+        filenames = (
+            "CMIP6_DRS.json",
+            "CMIP6_activity_id.json",
+            "CMIP6_experiment_id.json",
+            "CMIP6_frequency.json",
+            "CMIP6_grid_label.json",
+            "CMIP6_institution_id.json",
+            "CMIP6_license.json",
+            "CMIP6_nominal_resolution.json",
+            "CMIP6_realm.json",
+            "CMIP6_required_global_attributes.json",
+            "CMIP6_source_id.json",
+            "CMIP6_source_type.json",
+            "CMIP6_sub_experiment_id.json",
+            "CMIP6_table_id.json",
+            "mip_era.json",
+        )
+        name_pattern = re.compile(r"^(?:CMIP6_)?(?P<name>[^\.]+)\.json$").match
+        data = {}
+        for fname in filenames:
+            name = name_pattern(fname).groupdict().get("name")
+            fpath = "/".join([url, fname])
+            r = requests.get(fpath)
+            r.raise_for_status()
+            content = r.content.decode()
+            content = json.loads(content)
+            data[name] = content.get(name)
+        obj = cls([])
+        obj.update(data)
+        return obj
diff --git a/src/pymorize/global_attributes.py b/src/pymorize/global_attributes.py
@@ -0,0 +1,204 @@
+# global_attributes.py
+
+import re
+from datetime import datetime
+
+from .controlled_vocabularies import ControlledVocabularies
+
+# from loguru import logger
+
+data = ControlledVocabularies.load_from_git()
+
+required_global_attributes = data["required_global_attributes"]
-required_global_attributes = data["required_global_attributes"]
+_REQUIRED_GLOBAL_ATTRIBUTES = data["required_global_attributes"]
+"""dict: Global attributes that must be attached to the dataset to conform to CMIP6 standards"""
-required_global_attributes = data["required_global_attributes"]
+_REQUIRED_GLOBAL_ATTRIBUTES = data["required_global_attributes"]
+"""dict: Global attributes that must be attached to the dataset to conform to CMIP6 standards"""
+
+_parent_fields = (
+    "branch_method",
+    "branch_time_in_child",
+    "branch_time_in_parent",
+    "parent_experiment_id",
+    "parent_mip_era",
+    "parent_source_id",
+    "parent_time_units",
+    "parent_variant_label",
+)
+
+
+"""
+attribute dependencies
+----------------------
+Table header
+------------
+data_specs_version
+Conventions
+mip_era
+realm
+product
+frequency
+
+CV
+---
+source_id  <user input>
+    source
+    institution_id
+    license_info
+    model_component  # how to get model_component <user input>
+        native_nominal_resolution (nominal_resolution)
+        description (grid)
+experiment_id
+    activity_id
+    parent_experiment_id
+    sub_experiment_id
+
+User input
+----------
+table_id
+further_info_url
+institution
+variant_label
+    initialization_index
+    realization_index
+    forcing_index
+    physics_index
+
+system generated
+----------------
+creation_date
+tracking_id
+"""
+
+
+def _parse_variant_label(label: str) -> dict:
+    """Extracts indices values from variant label.
+    `label` must be of the form "r<int>i<int>p<int>f<int>".
+    Example
+    -------
+    >>> label = "r1i1p1f1"
+    >>> _parse_variant_label(label)
+    {'realization_index': 1, 'initialization_index': 1, 'physics_index': 1, 'forcing_index': 1}
+    """
+    pattern = re.compile(
+        r"r(?P<realization_index>\d+)"
+        r"i(?P<initialization_index>\d+)"
+        r"p(?P<physics_index>\d+)"
+        r"f(?P<forcing_index>\d+)"
+        r"$"
+    )
+    if label is None:
+        raise ValueError(
+            f"`label` must be of the form 'r<int>i<int>p<int>f<int>', Got: {label}"
+        )
+    d = pattern.match(label)
+    if d is None:
+        raise ValueError(
+            f"`label` must be of the form 'r<int>i<int>p<int>f<int>', Got: {label}"
+        )
+    d = {name: int(val) for name, val in d.groupdict().items()}
+    return d
+
+
+def _source_id_related(rule):
+    source_id = rule.source_id
+    cv = data["source_id"][source_id]
+    _inst_id = getattr(rule, "institution_id", None)
+    inst_id = cv["institution_id"]
+    if _inst_id:
+        assert _inst_id in inst_id
+    else:
+        if len(inst_id) > 1:
+            raise ValueError(
+                f"Provide institution_id. Mutiple values for institution_id found {inst_id}"
+            )
+        _inst_id = next(iter(inst_id))
+    model_components = cv["model_component"]
+    model_component = getattr(rule, "model_component", None)
+    if model_component:
+        assert model_component in model_components
+    else:
+        raise ValueError("Missing required attribute 'model_component'")
+    grid = model_components[model_component]["description"]
+    nominal_resolution = model_components[model_component]["native_nominal_resolution"]
+    license_id = cv["license_info"]["id"]
+    license_url = data["license"]["license_options"][license_id]["license_url"]
+    license_id = data["license"]["license_options"][license_id]["license_id"]
+    license_text = data["license"]["license"]
+    # make placeholders in license text
+    license_text = re.sub(r"<.*?>", "{}", license_text)
+    further_info_url = getattr(rule, "further_info_url", None)
+    if further_info_url is None:
+        license_text = re.sub(r"\[.*?\]", "", license_text)
+        license_text = license_text.format(_inst_id, license_id, license_url)
+    else:
+        license_text = license_text.format(
+            _inst_id, license_id, license_url, further_info_url
+        )
+    grid_label = getattr(rule, "grid_label", None)
+    if grid_label is None:
+        raise ValueError("Missing required attribute `grid_label`")
+    return {
+        "source_id": source_id,
+        "source": f"{model_component} ({cv['release_year']})",
+        "institution_id": _inst_id,
+        "institution": data["institution_id"][_inst_id],
+        "grid": grid,
+        "grid_label": grid_label,
+        "nominal_resolution": nominal_resolution,
+        "license": license_text,
+    }
+
+
+def _experiment_id_related(rule):
+    exp_id = rule.experiment_id
+    cv = data["experiment_id"][exp_id]
+    _activity_id = getattr(rule, "activity_id", None)
+    activity_id = cv["activity_id"]
+    if _activity_id:
+        assert _activity_id in activity_id
+    else:
+        if len(activity_id) > 1:
+            raise ValueError(f"Mutiple activity_id found {activity_id}")
+        _activity_id = next(iter(activity_id))
+    return {
+        "activity_id": _activity_id,
+        "experiment_id": exp_id,
+        "experiment": cv["experiment"],
+        "sub_experiment_id": " ".join(cv["sub_experiment_id"]),
+        "source_type": " ".join(cv["required_model_components"]),
+    }
+
+
+def _header_related(rule):
+    d = {}
+    drv = rule.data_request_variable
+    header = rule.data_request_variable.table_header
+    d["table_id"] = header.table_id
+    d["mip_era"] = header.mip_era
+    d["realm"] = header.realm
+    d["frequency"] = drv.frequency
+    d["Conventions"] = header.Conventions
+    d["product"] = header.product
+    d["data_specs_version"] = str(header.data_specs_version)
+    return d
+
+
+def _set_global_attributes(rule):
+    d = {}
+    d["variable_id"] = rule.cmor_variable
+    d["variant_label"] = rule.variant_label
+    d.update(_header_related(rule))
+    d.update(_parse_variant_label(rule.variant_label))
+    d.update(_source_id_related(rule))
+    d.update(_experiment_id_related(rule))
+    d = {k: d[k] for k in sorted(d)}
+    return d
+
+
+def set_global_attributes(ds, rule):
+    d = _set_global_attributes(rule)
+    # TODO: to be discussed -- should this be the file creation date or today's date?
+    # For now it is set to the current local date and time.
+    d["creation_date"] = str(datetime.today())
+    # TODO: how to generate a proper tracking_id is yet to be determined.
+    # For now only the tracking prefix is set.
+    d["tracking_id"] = "hdl:21.14100"
+    ds.attrs.update(d)
+    return ds
diff --git a/src/pymorize/global_attributes_checklist.org b/src/pymorize/global_attributes_checklist.org
@@ -0,0 +1,86 @@
+* GLOBAL ATTRIBUTES
+
+reference CMIP6_required_global_attributes.json
+
+| index | NAME                 | IMPLEMENTED | SOURCE        | EXAMPLE                                           |
+|-------+----------------------+-------------+---------------+---------------------------------------------------|
+|     1 | activity_id          | x           | USER          | CMIP                                              |
+|     2 | Conventions          | x           | table         | CF-1.7 CMIP-6.2                                   |
+|     3 | creation_date        |             |               | 2018-12-18T12:00:00Z                              |
+|     4 | data_specs_version   | x           | USER / table  | 01.00.27                                          |
+|     5 | experiment           |             |               | piControl                                         |
+|     6 | experiment_id        |             |               | piControl                                         |
+|     7 | forcing_index        | x           | derived from  | 1                                                 |
+|       |                      |             | variant_label |                                                   |
+|     8 | frequency            | x           | table         | mon                                               |
+|     9 | further_info_url     | x           | USER          | *too_long_to_list_here                            |
+|       |                      |             | (optional,    |                                                   |
+|       |                      |             | default:      |                                                   |
+|       |                      |             | None)         |                                                   |
+|    10 | grid                 |             |               | *too_long_to_list_here                            |
+|    11 | grid_label           |             |               | gn                                                |
+|    12 | initialization_index | x           | derived from  | 1                                                 |
+|       |                      |             | variant_label |                                                   |
+|    13 | institution          |             |               | *too_long_to_list_here                            |
+|    14 | institution_id       | x           | using         | AWI                                               |
+|       |                      |             | default: AWI  |                                                   |
+|    15 | license              | x           | CV            | *too_long_to_list_here                            |
+|    16 | mip_era              | x           | table         | CMIP6                                             |
+|    17 | nominal_resolution   |             |               | 25 km                                             |
+|    18 | physics_index        | x           | derived from  | 1                                                 |
+|       |                      |             | variant_label |                                                   |
+|    19 | product              | x           | table         | model-output                                      |
+|    20 | realization_index    | x           | derived from  | 1                                                 |
+|       |                      |             | variant_label |                                                   |
+|    21 | realm                | x           | table         | ocean                                             |
+|    22 | source               |             |               | AWI-CM-1-1-MR                                     |
+|    23 | source_id            |             |               | AWI-CM-1-1-MR                                     |
+|    24 | source_type          |             |               | AOGCM                                             |
+|    25 | sub_experiment       |             |               | none                                              |
+|    26 | sub_experiment_id    |             |               | none                                              |
+|    27 | table_id             | x           | USER / use    | Omon                                              |
+|       |                      |             | all matching  |                                                   |
+|       |                      |             | tables        |                                                   |
+|    28 | tracking_id          |             |               | hdl:21.14100/84bfc093-b0a3-44ee-b733-91239b6fa6b2 |
+|    29 | variable_id          | x           | USER          | fgco2                                             |
+|    30 | variant_label        | x           | USER          | r1i1p1f1                                          |
+
+
+
+EXAMPLE
+  - further_info_url: "https://furtherinfo.es-doc.org/CMIP6.AWI.AWI-CM-1-1-MR.piControl.none.r1i1p1f1"
+  - grid: "FESOM 1.4 (unstructured grid in the horizontal with 830305 wet nodes; 46 levels; top grid cell 0-5 m)"
+  - institution: "Alfred Wegener Institute, Helmholtz Centre for Polar and Marine Research, Am Handelshafen 12, 27570 Bremerhaven, Germany"
+  - license: "CMIP6 model data produced by Alfred Wegener Institute, Helmholtz
+               Centre for Polar and Marine Research, Am Handelshafen 12, 27570 Bremerhaven,
+               Germany is licensed under a Creative Commons Attribution-ShareAlike 4.0
+               International License (https://creativecommons.org/licenses/). Consult
+               https://pcmdi.llnl.gov/CMIP6/TermsOfUse for terms of use governing CMIP6
+               output, including citation requirements and proper acknowledgment. Further
+               information about this data, including some limitations, can be found via
+               the further_info_url (recorded as a global attribute in this file). The data
+               producers and data providers make no warranty, either express or implied,
+               including, but not limited to, warranties of merchantability and fitness for
+               a particular purpose. All liabilities arising from the supply of the
+               information (including any liability arising in negligence) are excluded to
+               the fullest extent permitted by law."
+  - branch_method: "standard"
+  - branch_time_in_child: 0.
+  - branch_time_in_parent: 182622.
+  - parent_activity_id: "CMIP"
+  - parent_experiment_id: "piControl-spinup"
+  - parent_mip_era: "CMIP6"
+  - parent_source_id: "AWI-CM-1-1-MR"
+  - parent_time_units: "days since 1901-1-1"
+  - parent_variant_label: "r1i1p1f1"
+
+
+COMMENT
+  4. `data_specs_version`: At the moment, tables with a specific version ("01.00.13", for instance) are used.
+     Exposing this attribute to the user would mean fetching the user-specified version of the tables from git.
+     This is currently not implemented.
+  27. `table_id`: [Optional] A CMOR_variable may be in more than one table.
+     If the user does not specify a table_id, then all matching tables for this variable
+     are considered.
+
+