Refactoring configurations and examples.

cedadev · Nov 11, 2024 · 53f9c5a · 53f9c5a
1 parent 1a47f9b
commit 53f9c5a
Show file tree

Hide file tree

Showing 47 changed files with 1,030 additions and 1,349 deletions.
diff --git a/example/conf/collection-generator.yaml b/example/conf/collection-generator.yaml
@@ -7,7 +7,8 @@ recipes_root: recipes/
 # The input plugins to be run for the generator
 inputs:
   - name: text_file
-    filepath: input/collections.txt
+    conf:
+      filepath: input/collections.txt
 
 # The output plugins to be run for the generator
 outputs:
@@ -18,8 +19,9 @@ outputs:
         stac_version: '1.0.0'
         stac_extensions: []
   - name: json_file
-    dirpath: output/collections
-    filename_term: id
+    conf:
+      dirpath: output/collections
+      filename_term: id
     mappings:
       - name: stac
         stac_version: '1.0.0'

diff --git a/example/conf/item-generator.yaml b/example/conf/item-generator.yaml
@@ -7,7 +7,8 @@ recipes_root: recipes/
 # The input plugins to be run for the generator
 inputs:
   - name: text_file
-    filepath: input/assets.txt
+    conf:
+      filepath: input/assets.txt
 
 # The output plugins to be run for the generator
 outputs:
@@ -17,12 +18,7 @@ outputs:
       - name: stac
         stac_version: '1.0.0'
         stac_extensions: []
-  - name: json_file
-    dirpath: output/items
-    filename_term: id
-    mappings:
-      - name: stac
-        stac_version: '1.0.0'
-        stac_extensions: []
   - name: text_file
-    filepath: input/collections.txt
+    conf:
+      filepath: input/collections.txt
+
diff --git a/example/recipes/collection/CMIP6.CMIP.MOHC.UKESM1-0-LL.yaml b/example/recipes/collection/CMIP6.CMIP.MOHC.UKESM1-0-LL.yaml
@@ -1,20 +1,25 @@
-# The type of STAC record that will be generated
+# The type of record that will be generated
 type: collection
 
-# These extraction methods will be run after `extraction_methods` and should generate the id of the record
+# This section is optional and can be run separately to find the id of a record
 id:
   - method: default
     inputs:
       defaults:
-        collection_id: cmip6
+        id: dkrz
 
 # The extaction methods are run in series with the output dictionary is passed from one to the next
 # extaction methods add, update or remove the data from the output dictionary
 extraction_methods:
+  - method: default
+    inputs:
+      defaults:
+        id: dkrz
+
   - method: json_file
     inputs:
       dirpath: output/items/
-      terms:
+      properties:
         - mip_era
         - activity_id
         - institution_id

diff --git a/example/recipes/item/CMIP6.CMIP.MOHC.UKESM1-0-LL.yaml b/example/recipes/item/CMIP6.CMIP.MOHC.UKESM1-0-LL.yaml
@@ -3,15 +3,19 @@ paths:
   - https://cmip6-zarr-o.s3-ext.jc.rl.ac.uk/CMIP6.CMIP.MOHC.UKESM1-0-LL
   - https://cmip6-zarr-o.s3-ext.jc.rl.ac.uk/CMIP6.C4MIP.MOHC.UKESM1-0-LL
 
-# The type of STAC record that will be generated
+# The type of record that will be generated
 type: item
 
-# These extraction methods will be run after `extraction_methods` and should generate the id of the record
+# This section is optional and can be run separately to find the id of a record
 id:
-  - method: default
+  - method: regex
+    inputs:
+      regex: '\/(?P<mip_era>\w*)\.(?P<activity_id>\w*)\.(?P<institution_id>[\w-]*)\.(?P<source_id>[\w-]*)\/(?P<experiment_id>[\w-]*)\.(?P<member_id>\w*)\.(?P<table_id>\w*)\.(?P<var_id>\w*)\.(?P<grid_label>\w*)\.(?P<version>\w*)'
+
+  - method: string_template
     inputs:
-      defaults:
-        item_id: $instance_id
+      template: '{mip_era}.{activity_id}.{institution_id}.{source_id}.{table_id}.{var_id}.{version}'
+      output_key: id
 
 # The extaction methods are run in series with the output dictionary is passed from one to the next
 # extaction methods add, update or remove the data from the output dictionary
@@ -23,13 +27,14 @@ extraction_methods:
   - method: string_template
     inputs:
       template: '{mip_era}.{activity_id}.{institution_id}.{source_id}.{table_id}.{var_id}.{version}'
-      output_key: instance_id
+      output_key: id
 
 # Some extraction methods generate assets which can also include their own list of extration methods to be run on the assets
-  - method: intake_assets
+  - method: assets
     inputs:
-      uri: https://raw.githubusercontent.com/cedadev/cmip6-object-store/master/catalogs/ceda-zarr-cmip6.json
-      object_path_attr: zarr_path
+      backend: intake_esm
+      input_term: https://raw.githubusercontent.com/cedadev/cmip6-object-store/master/catalogs/ceda-zarr-cmip6.json
+      href_term: zarr_path
       search_kwargs:
         mip_era: $mip_era
         activity_id: $activity_id
@@ -47,7 +52,7 @@ extraction_methods:
   - method: lambda
     inputs:
       function: 'lambda assets: {f"data{str(en+1).zfill(4)}": assets[key] for en, key in enumerate(sorted(assets))}'
-      input_args:
+      args:
         - $assets
       output_key: assets
 
@@ -58,4 +63,4 @@ extraction_methods:
 
 # member of defines the other recipes that define a parent of this record
 member_of:
-  - recipes/collection/CMIP6.CMIP.MOHC.UKESM1-0-LL.yaml
+  - recipes/collection/CMIP6.CMIP.MOHC.UKESM1-0-LL.yaml
diff --git a/pyproject.toml b/pyproject.toml
@@ -9,11 +9,12 @@ homepage = "https://github.com/cedadev/stac-generator"
 keywords = ['stac', 'metadata', 'plugin', 'framework']
 
 packages = [
-    { include = "stac_generator"},
+    { include = "stac_generator" }
 ]
 
 [tool.poetry.scripts]
 stac_generator = 'stac_generator.scripts.stac_generator:main'
+recipe_keys = 'stac_generator.scripts.recipe_keys:main'
 
 [tool.poetry.dependencies]
 python = "^3.10"
@@ -89,37 +90,34 @@ ignore_missing_imports = true
 packages = "stac_generator"
 
 [tool.poetry.plugins."stac_generator.inputs"]
+elasticsearch_aggregation = "stac_generator.plugins.inputs.elasticsearch_aggregation:ElasticsearchAggregationInput"
 file_system = "stac_generator.plugins.inputs.file_system:FileSystemInput"
-object_store = "stac_generator.plugins.inputs.object_store:ObjectStoreInput"
 intake_esm = "stac_generator.plugins.inputs.intake_esm:IntakeESMInput"
+object_store = "stac_generator.plugins.inputs.object_store:ObjectStoreInput"
 rabbitmq = "stac_generator.plugins.inputs.rabbit_mq:RabbitMQInput"
-thredds = "stac_generator.plugins.inputs.thredds:ThreddsInput"
-text_file = "stac_generator.plugins.inputs.text_file:TextFileInput"
 solr = "stac_generator.plugins.inputs.solr:SolrInput"
-elasticsearch = "stac_generator.plugins.inputs.elasticsearch:ElasticsearchInput"
+text_file = "stac_generator.plugins.inputs.text_file:TextFileInput"
+thredds = "stac_generator.plugins.inputs.thredds:ThreddsInput"
 
 [tool.poetry.plugins."stac_generator.outputs"]
-standard_out = "stac_generator.plugins.outputs.standard_out:StandardOutOutput"
-standard_out_bulk = "stac_generator.plugins.bulk_outputs.standard_out:StandardOutBulkOutput"
 elasticsearch = "stac_generator.plugins.outputs.elasticsearch:ElasticsearchOutput"
 elasticsearch_bulk = "stac_generator.plugins.bulk_outputs.elasticsearch:ElasticsearchBulkOutput"
-stacapi = "stac_generator.plugins.outputs.stacapi_backend:StacApiOutputBackend"
-text_file = "stac_generator.plugins.outputs.text_file:TextFileOutput"
+intake_esm = "stac_generator.plugins.outputs.intake_esm:IntakeESMOutput"
 json_file = "stac_generator.plugins.outputs.json_file:JsonFileOutput"
 rabbitmq = "stac_generator.plugins.outputs.rabbit_mq:RabbitMQOutput"
 rabbitmq_bulk = "stac_generator.plugins.bulk_outputs.rabbit_mq:RabbitMQBulkOutput"
-intake_esm = "stac_generator.plugins.outputs.intake_esm:IntakeESMOutput"
 stac_fastapi = "stac_generator.plugins.outputs.stac_fastapi:STACFastAPIOutput"
+standard_out = "stac_generator.plugins.outputs.standard_out:StandardOutOutput"
+standard_out_bulk = "stac_generator.plugins.bulk_outputs.standard_out:StandardOutBulkOutput"
+text_file = "stac_generator.plugins.outputs.text_file:TextFileOutput"
 
 [tool.poetry.plugins."stac_generator.mappings"]
 ceda = "stac_generator.plugins.mappings.ceda:CEDAMapping"
 stac = "stac_generator.plugins.mappings.stac:STACMapping"
 jinja = "stac_generator.plugins.mappings.jinja2:Jinja2Mapping"
 
-[tool.poetry.plugins."stac_generator.generators"]
-asset = "stac_generator.plugins.generators.asset:AssetGenerator"
-item = "stac_generator.plugins.generators.item:ItemGenerator"
-collection = "stac_generator.plugins.generators.collection:CollectionGenerator"
+[tool.poetry.plugins."stac_generator.generator"]
+generator = "stac_generator.core.generator:Generator"
 
 [build-system]
 requires = ["poetry-core"]

diff --git a/stac_generator/core/bulk_output.py b/stac_generator/core/bulk_output.py
@@ -8,29 +8,38 @@
 __license__ = "BSD - see LICENSE file in top-level package directory"
 __contact__ = "[email protected]"
 
-from abc import ABC, abstractmethod
+from abc import abstractmethod
 
 from cachetools import Cache
+from pydantic import BaseModel, Field
 
+from stac_generator.core.process_config import SetConfig
 
-class BaseBulkOutput(ABC):
+
+class BulkOutputConf(BaseModel):
+    """Bulk output config model."""
+
+    cache_max_size: str = Field(
+        description="Max size of cache.",
+    )
+
+
+class BulkOutput(SetConfig):
     """
     Base class to define an bulk output
     """
 
+    config_class = BulkOutputConf
+
     def __init__(self, **kwargs):
         """
         Set the kwargs to generate instance attributes of the same name and create cache
 
         :param kwargs:
         """
-        for k, v in kwargs.items():
-            setattr(self, k, v)
-
-        if not hasattr(self, "cache_max_size"):
-            self.cache_max_size = 100
+        super().__init__(**kwargs)
 
-        self.data_cache = Cache(maxsize=self.cache_max_size + 1)
+        self.data_cache = Cache(maxsize=self.conf.cache_max_size + 1)
 
     def __del__(self):
         self.clear_cache()
@@ -68,7 +77,7 @@ def run(self, data: dict) -> None:
         # add to cache
         self.data_cache.update(self.data_to_cache(data))
 
-        if self.data_cache.currsize >= self.cache_max_size:
+        if self.data_cache.currsize >= self.conf.cache_max_size:
             self.clear_cache()
 
     def clear_cache(self) -> None:

diff --git a/stac_generator/core/extraction_method.py b/stac_generator/core/extraction_method.py