From ae5384a7f86a5e3e964bb9fe9301fedf13afd5bc Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 7 Nov 2024 12:26:45 +0000 Subject: [PATCH 01/78] Captured init arguments Signed-off-by: Elena Khaustova --- kedro/io/core.py | 20 ++++++++++++++++++++ kedro/io/kedro_data_catalog.py | 17 +++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/kedro/io/core.py b/kedro/io/core.py index 01e85963b9..7ffaaac41a 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -148,6 +148,12 @@ class AbstractDataset(abc.ABC, Generic[_DI, _DO]): need to change the `_EPHEMERAL` attribute to 'True'. """ _EPHEMERAL = False + _config: dict[str, Any] = None + + def __post_init__(self, *args, **kwargs): + # print("args:", args) + # print("kwargs", kwargs) + self._config = kwargs @classmethod def from_config( @@ -201,6 +207,9 @@ def from_config( ) from err return dataset + def to_config(self) -> dict[str, Any]: + return self._config + @property def _logger(self) -> logging.Logger: return logging.getLogger(__name__) @@ -286,6 +295,17 @@ def __init_subclass__(cls, **kwargs: Any) -> None: If `_load` or `_save` are defined, alias them as a prerequisite. 
""" + + def init_decorator(previous_init): + def new_init(self, *args, **kwargs): + previous_init(self, *args, **kwargs) + if type(self) is cls: + self.__post_init__(*args, **kwargs) + + return new_init + + cls.__init__ = init_decorator(cls.__init__) + super().__init_subclass__(**kwargs) if hasattr(cls, "_load") and not cls._load.__qualname__.startswith("Abstract"): diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 8bbf573d7e..240801d157 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -264,6 +264,23 @@ def _ipython_key_completions_(self) -> list[str]: def _logger(self) -> logging.Logger: return logging.getLogger(__name__) + def to_config( + self, + ) -> tuple[ + dict[str, dict[str, Any]], dict[str, dict[str, Any]], dict[str, str], str + ]: + catalog: dict[str, dict[str, Any]] = {} + credentials: dict[str, dict[str, Any]] = {} + + for ds_name, ds in self._datasets.items(): + ds_config = ds.to_config() + # print(ds) + # print(ds_config) + # print() + catalog[ds_name] = ds_config + + return catalog, credentials, self._load_versions, self._save_version + @classmethod def from_config( cls, From fcdf357e8c06d9b39e7966c1855d4d2c16d1327d Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 7 Nov 2024 16:39:31 +0000 Subject: [PATCH 02/78] Implemented unresoloving credentials Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 23 +++++++++++++++++++++++ kedro/io/core.py | 2 ++ kedro/io/kedro_data_catalog.py | 15 ++++++++++----- 3 files changed, 35 insertions(+), 5 deletions(-) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index f722bedb6e..8322d9627f 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -261,6 +261,29 @@ def _resolve_config_credentials( return resolved_configs + @classmethod + def unresolve_config_credentials( + cls, ds_name: str, ds_config: dict[str, dict[str, Any]] | None + ) -> 
tuple[dict[str, dict[str, Any]], dict[str, dict[str, Any]]]: + ds_config = ds_config or {} + credentials = {} + credentials_ref = f"{ds_name}_{CREDENTIALS_KEY}" + + def unresolve(config: Any): + if credentials: + return + for key, val in config.items(): + if key == CREDENTIALS_KEY: + credentials[credentials_ref] = config[key] + config[key] = credentials_ref + return + if isinstance(val, dict): + unresolve(val) + + unresolve(ds_config) + + return ds_config, credentials + def resolve_pattern(self, ds_name: str) -> dict[str, Any]: """Resolve dataset patterns and return resolved configurations based on the existing patterns.""" matched_pattern = self.match_pattern(ds_name) diff --git a/kedro/io/core.py b/kedro/io/core.py index 7ffaaac41a..1ba8986cca 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -208,6 +208,8 @@ def from_config( return dataset def to_config(self) -> dict[str, Any]: + if "type" not in self._config: + self._config["type"] = f"{type(self).__module__}.{type(self).__name__}" return self._config @property diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 240801d157..46475987d3 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -273,11 +273,16 @@ def to_config( credentials: dict[str, dict[str, Any]] = {} for ds_name, ds in self._datasets.items(): - ds_config = ds.to_config() - # print(ds) - # print(ds_config) - # print() - catalog[ds_name] = ds_config + cur_config, cur_credentials = ( + self._config_resolver.unresolve_config_credentials( + ds_name, ds.to_config() + ) + ) + catalog[ds_name] = cur_config + credentials.update(cur_credentials) + + # print(catalog) + # print(credentials) return catalog, credentials, self._load_versions, self._save_version From b14737419be03d5105e12768f2238dfdcf921968 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 7 Nov 2024 16:43:44 +0000 Subject: [PATCH 03/78] Added some comments Signed-off-by: Elena Khaustova --- 
kedro/io/catalog_config_resolver.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index 8322d9627f..e8708cc8a1 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -270,6 +270,8 @@ def unresolve_config_credentials( credentials_ref = f"{ds_name}_{CREDENTIALS_KEY}" def unresolve(config: Any): + # We don't expect credentials key appears more than once in the config, + # So once we found the key first time we unresolve it and stop iterating after if credentials: return for key, val in config.items(): From 0ed0c1e934b874455830f1b3b7243a4ac8a73344 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 7 Nov 2024 16:56:48 +0000 Subject: [PATCH 04/78] Put type in first place for dataset config Signed-off-by: Elena Khaustova --- kedro/io/core.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index 1ba8986cca..527e6ba079 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -151,8 +151,7 @@ class AbstractDataset(abc.ABC, Generic[_DI, _DO]): _config: dict[str, Any] = None def __post_init__(self, *args, **kwargs): - # print("args:", args) - # print("kwargs", kwargs) + # TODO: decide what to do with args self._config = kwargs @classmethod @@ -208,9 +207,10 @@ def from_config( return dataset def to_config(self) -> dict[str, Any]: - if "type" not in self._config: - self._config["type"] = f"{type(self).__module__}.{type(self).__name__}" - return self._config + return_config = {"type": f"{type(self).__module__}.{type(self).__name__}"} + return_config.update(self._config) + + return return_config @property def _logger(self) -> logging.Logger: From 3c839a9c83c8b82d303bdc29ae83cae4e969d05c Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 7 Nov 2024 18:55:30 +0000 Subject: [PATCH 05/78] Handled version key Signed-off-by: Elena Khaustova --- kedro/io/core.py | 22 +++++++++++++++++----- 
kedro/io/kedro_data_catalog.py | 23 +++++++++++++++++------ 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index 527e6ba079..6e0905ab75 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -57,6 +57,7 @@ "s3a", "s3n", ) +TYPE_KEY = "type" class DatasetError(Exception): @@ -206,11 +207,22 @@ def from_config( ) from err return dataset - def to_config(self) -> dict[str, Any]: - return_config = {"type": f"{type(self).__module__}.{type(self).__name__}"} + def to_config(self) -> tuple[dict[str, Any], dict[str, str] | None, str | None]: + # TODO: pop data key for MemoryDataset + # TODO: check other datasets + return_config = { + f"{TYPE_KEY}": f"{type(self).__module__}.{type(self).__name__}" + } + load_versions: dict[str, str] | None = None + save_version: str | None = None + return_config.update(self._config) - return return_config + if VERSION_KEY in return_config: + version = return_config.pop(VERSION_KEY) + load_versions, save_version = version.load, version.save + + return return_config, load_versions, save_version @property def _logger(self) -> logging.Logger: @@ -506,14 +518,14 @@ def parse_dataset_definition( config = copy.deepcopy(config) # TODO: remove when removing old catalog as moved to KedroDataCatalog - if "type" not in config: + if TYPE_KEY not in config: raise DatasetError( "'type' is missing from dataset catalog configuration." "\nHint: If this catalog entry is intended for variable interpolation, " "make sure that the top level key is preceded by an underscore." 
) - dataset_type = config.pop("type") + dataset_type = config.pop(TYPE_KEY) class_obj = None if isinstance(dataset_type, str): if len(dataset_type.strip(".")) != len(dataset_type): diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 46475987d3..ac46b8ce42 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -267,24 +267,35 @@ def _logger(self) -> logging.Logger: def to_config( self, ) -> tuple[ - dict[str, dict[str, Any]], dict[str, dict[str, Any]], dict[str, str], str + dict[str, dict[str, Any]], + dict[str, dict[str, Any]], + dict[str, dict[str, str] | None], + dict[str, str | None], ]: + # TODO: process lazy loaded datasets catalog: dict[str, dict[str, Any]] = {} credentials: dict[str, dict[str, Any]] = {} + load_versions: dict[str : dict[str, str] | None] = {} + save_version: dict[str, str | None] = {} for ds_name, ds in self._datasets.items(): - cur_config, cur_credentials = ( + resolved_config, cur_load_versions, cur_save_version = ds.to_config() + unresolved_config, unresolved_credentials = ( self._config_resolver.unresolve_config_credentials( - ds_name, ds.to_config() + ds_name, resolved_config ) ) - catalog[ds_name] = cur_config - credentials.update(cur_credentials) + catalog[ds_name] = unresolved_config + credentials.update(unresolved_credentials) + load_versions[ds_name] = cur_load_versions + save_version[ds_name] = cur_save_version # print(catalog) # print(credentials) + # print(load_versions) + # print(save_version) - return catalog, credentials, self._load_versions, self._save_version + return catalog, credentials, load_versions, save_version @classmethod def from_config( From 29bb714b69d036dd9e2d845e8bc257591c4ae1b5 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 8 Nov 2024 11:38:56 +0000 Subject: [PATCH 06/78] Added lazy dataset to_config Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 2 +- kedro/io/kedro_data_catalog.py | 18 +++++++++++++----- 2 files 
changed, 14 insertions(+), 6 deletions(-) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index e8708cc8a1..2a3ae8d083 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -265,7 +265,7 @@ def _resolve_config_credentials( def unresolve_config_credentials( cls, ds_name: str, ds_config: dict[str, dict[str, Any]] | None ) -> tuple[dict[str, dict[str, Any]], dict[str, dict[str, Any]]]: - ds_config = ds_config or {} + ds_config = copy.deepcopy(ds_config) or {} credentials = {} credentials_ref = f"{ds_name}_{CREDENTIALS_KEY}" diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index ac46b8ce42..eaf5b4822c 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -272,12 +272,20 @@ def to_config( dict[str, dict[str, str] | None], dict[str, str | None], ]: - # TODO: process lazy loaded datasets catalog: dict[str, dict[str, Any]] = {} credentials: dict[str, dict[str, Any]] = {} - load_versions: dict[str : dict[str, str] | None] = {} + load_version: dict[str, dict[str, str] | None] = {} save_version: dict[str, str | None] = {} + for ds_name, ds in self._lazy_datasets.items(): + unresolved_config, unresolved_credentials = ( + self._config_resolver.unresolve_config_credentials(ds_name, ds.config) + ) + catalog[ds_name] = unresolved_config + credentials.update(unresolved_credentials) + load_version[ds_name] = ds.load_version + save_version[ds_name] = ds.save_version + for ds_name, ds in self._datasets.items(): resolved_config, cur_load_versions, cur_save_version = ds.to_config() unresolved_config, unresolved_credentials = ( @@ -287,15 +295,15 @@ def to_config( ) catalog[ds_name] = unresolved_config credentials.update(unresolved_credentials) - load_versions[ds_name] = cur_load_versions + load_version[ds_name] = cur_load_versions save_version[ds_name] = cur_save_version # print(catalog) # print(credentials) - # print(load_versions) + # 
print(load_version) # print(save_version) - return catalog, credentials, load_versions, save_version + return catalog, credentials, load_version, save_version @classmethod def from_config( From 49858b6e8211bfc2382b8ae9b30942c39e817b7f Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 8 Nov 2024 11:51:01 +0000 Subject: [PATCH 07/78] Removed data key from MemoryDataset Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 6 +++--- kedro/io/core.py | 5 ++++- kedro/io/kedro_data_catalog.py | 3 +++ 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index 2a3ae8d083..1c7b72caa0 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -265,7 +265,7 @@ def _resolve_config_credentials( def unresolve_config_credentials( cls, ds_name: str, ds_config: dict[str, dict[str, Any]] | None ) -> tuple[dict[str, dict[str, Any]], dict[str, dict[str, Any]]]: - ds_config = copy.deepcopy(ds_config) or {} + ds_config_copy = copy.deepcopy(ds_config) or {} credentials = {} credentials_ref = f"{ds_name}_{CREDENTIALS_KEY}" @@ -282,9 +282,9 @@ def unresolve(config: Any): if isinstance(val, dict): unresolve(val) - unresolve(ds_config) + unresolve(ds_config_copy) - return ds_config, credentials + return ds_config_copy, credentials def resolve_pattern(self, ds_name: str) -> dict[str, Any]: """Resolve dataset patterns and return resolved configurations based on the existing patterns.""" diff --git a/kedro/io/core.py b/kedro/io/core.py index 6e0905ab75..b8bf91fb80 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -208,7 +208,6 @@ def from_config( return dataset def to_config(self) -> tuple[dict[str, Any], dict[str, str] | None, str | None]: - # TODO: pop data key for MemoryDataset # TODO: check other datasets return_config = { f"{TYPE_KEY}": f"{type(self).__module__}.{type(self).__name__}" @@ -222,6 +221,10 @@ def to_config(self) -> 
tuple[dict[str, Any], dict[str, str] | None, str | None]: version = return_config.pop(VERSION_KEY) load_versions, save_version = version.load, version.save + # Pop data from configuration + if type(self).__name__ == "MemoryDataset": + return_config.pop("data", None) + return return_config, load_versions, save_version @property diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index eaf5b4822c..ec03d90220 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -299,8 +299,11 @@ def to_config( save_version[ds_name] = cur_save_version # print(catalog) + # print("-") # print(credentials) + # print("-") # print(load_version) + # print("-") # print(save_version) return catalog, credentials, load_version, save_version From 0d0ba918fb9be806929cda8484297113210000e3 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 8 Nov 2024 12:19:18 +0000 Subject: [PATCH 08/78] Added TODOs Signed-off-by: Elena Khaustova --- kedro/io/core.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kedro/io/core.py b/kedro/io/core.py index b8bf91fb80..390a73d36e 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -208,6 +208,9 @@ def from_config( return dataset def to_config(self) -> tuple[dict[str, Any], dict[str, str] | None, str | None]: + # TODO: pop metadata? 
+ # TODO: test with LambdaDataset/SharedMemoryDataset - it won't work + # TODO: parse CachedDataset config # TODO: check other datasets return_config = { f"{TYPE_KEY}": f"{type(self).__module__}.{type(self).__name__}" From 8413f5874ef686d395c1c0674df3ae5b1b8d12e3 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 11 Nov 2024 15:26:33 +0000 Subject: [PATCH 09/78] Saved call args Signed-off-by: Elena Khaustova --- kedro/io/core.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index 390a73d36e..41e011e619 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -15,6 +15,7 @@ from datetime import datetime, timezone from functools import partial, wraps from glob import iglob +from inspect import getcallargs from operator import attrgetter from pathlib import Path, PurePath, PurePosixPath from typing import ( @@ -149,11 +150,12 @@ class AbstractDataset(abc.ABC, Generic[_DI, _DO]): need to change the `_EPHEMERAL` attribute to 'True'. 
""" _EPHEMERAL = False - _config: dict[str, Any] = None + _init_args: dict[str, Any] = None - def __post_init__(self, *args, **kwargs): - # TODO: decide what to do with args - self._config = kwargs + def __post_init__(self, call_args: dict[str, Any]): + # print(call_args) + self._init_args = call_args + self._init_args.pop("self", None) @classmethod def from_config( @@ -212,16 +214,17 @@ def to_config(self) -> tuple[dict[str, Any], dict[str, str] | None, str | None]: # TODO: test with LambdaDataset/SharedMemoryDataset - it won't work # TODO: parse CachedDataset config # TODO: check other datasets + # print("to_config", signature(self.__init__)) return_config = { f"{TYPE_KEY}": f"{type(self).__module__}.{type(self).__name__}" } load_versions: dict[str, str] | None = None save_version: str | None = None - return_config.update(self._config) + return_config.update(self._init_args) - if VERSION_KEY in return_config: - version = return_config.pop(VERSION_KEY) + version = return_config.pop(VERSION_KEY, None) + if version: load_versions, save_version = version.load, version.save # Pop data from configuration @@ -316,11 +319,14 @@ def __init_subclass__(cls, **kwargs: Any) -> None: """ + init_func: Callable = cls.__init__ + def init_decorator(previous_init): def new_init(self, *args, **kwargs): previous_init(self, *args, **kwargs) if type(self) is cls: - self.__post_init__(*args, **kwargs) + call_args = getcallargs(init_func, self, *args, **kwargs) + self.__post_init__(call_args) return new_init From a89db7e533def9b33d643955f38c1dcc13e31ff6 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 11 Nov 2024 15:43:05 +0000 Subject: [PATCH 10/78] Saved only set credentials Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 2 +- kedro/io/core.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index 1c7b72caa0..5bd2149034 100644 --- 
a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -275,7 +275,7 @@ def unresolve(config: Any): if credentials: return for key, val in config.items(): - if key == CREDENTIALS_KEY: + if key == CREDENTIALS_KEY and config[key]: credentials[credentials_ref] = config[key] config[key] = credentials_ref return diff --git a/kedro/io/core.py b/kedro/io/core.py index 41e011e619..c35f21bab9 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -214,7 +214,6 @@ def to_config(self) -> tuple[dict[str, Any], dict[str, str] | None, str | None]: # TODO: test with LambdaDataset/SharedMemoryDataset - it won't work # TODO: parse CachedDataset config # TODO: check other datasets - # print("to_config", signature(self.__init__)) return_config = { f"{TYPE_KEY}": f"{type(self).__module__}.{type(self).__name__}" } From b081c655fac240f1d38d46655db249a0c43ec835 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 11 Nov 2024 17:54:10 +0000 Subject: [PATCH 11/78] Processed CachedDataset case Signed-off-by: Elena Khaustova --- kedro/io/core.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index c35f21bab9..85f162f0a6 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -212,7 +212,6 @@ def from_config( def to_config(self) -> tuple[dict[str, Any], dict[str, str] | None, str | None]: # TODO: pop metadata? 
# TODO: test with LambdaDataset/SharedMemoryDataset - it won't work - # TODO: parse CachedDataset config # TODO: check other datasets return_config = { f"{TYPE_KEY}": f"{type(self).__module__}.{type(self).__name__}" @@ -222,9 +221,24 @@ def to_config(self) -> tuple[dict[str, Any], dict[str, str] | None, str | None]: return_config.update(self._init_args) + if type(self).__name__ == "CachedDataset": + cached_ds = return_config.pop("dataset") + if isinstance(cached_ds, dict): + cached_ds_return_config = cached_ds + else: + cached_ds_return_config, load_versions, save_version = ( + cached_ds.to_config() + ) + if "versioned" in cached_ds_return_config: + return_config["versioned"] = cached_ds_return_config.pop("versioned") + return_config["dataset"] = cached_ds_return_config + version = return_config.pop(VERSION_KEY, None) if version: - load_versions, save_version = version.load, version.save + load_versions, save_version = ( + load_versions or version.load, + save_version or version.save, + ) # Pop data from configuration if type(self).__name__ == "MemoryDataset": From 18c0ad6a149db2cb1582b6de9a9a06d6a75cc34d Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 12 Nov 2024 11:21:34 +0000 Subject: [PATCH 12/78] Updated TODOs Signed-off-by: Elena Khaustova --- kedro/io/core.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index 85f162f0a6..b4d60dd034 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -212,7 +212,10 @@ def from_config( def to_config(self) -> tuple[dict[str, Any], dict[str, str] | None, str | None]: # TODO: pop metadata? 
# TODO: test with LambdaDataset/SharedMemoryDataset - it won't work - # TODO: check other datasets + # TODO: check IncrementalDataset/PartitionedDataset + # TODO: check dataset factories + # TODO: check transcoding + # TODO: test loading back return_config = { f"{TYPE_KEY}": f"{type(self).__module__}.{type(self).__name__}" } From 2751ea83eb3794d91a8b6966c258981ef77c6dea Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 12 Nov 2024 12:05:01 +0000 Subject: [PATCH 13/78] Tested with PartitionedDataset Signed-off-by: Elena Khaustova --- kedro/io/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index b4d60dd034..ef226b34cf 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -212,7 +212,7 @@ def from_config( def to_config(self) -> tuple[dict[str, Any], dict[str, str] | None, str | None]: # TODO: pop metadata? # TODO: test with LambdaDataset/SharedMemoryDataset - it won't work - # TODO: check IncrementalDataset/PartitionedDataset + # TODO: check IncrementalDataset # TODO: check dataset factories # TODO: check transcoding # TODO: test loading back From fc576fffed5258919de04613fa818925febf37f2 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 12 Nov 2024 12:36:17 +0000 Subject: [PATCH 14/78] Popped metadata Signed-off-by: Elena Khaustova --- kedro/io/core.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index ef226b34cf..1a8389fca2 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -210,9 +210,6 @@ def from_config( return dataset def to_config(self) -> tuple[dict[str, Any], dict[str, str] | None, str | None]: - # TODO: pop metadata? 
- # TODO: test with LambdaDataset/SharedMemoryDataset - it won't work - # TODO: check IncrementalDataset # TODO: check dataset factories # TODO: check transcoding # TODO: test loading back @@ -247,6 +244,9 @@ def to_config(self) -> tuple[dict[str, Any], dict[str, str] | None, str | None]: if type(self).__name__ == "MemoryDataset": return_config.pop("data", None) + # Pop metadata from configuration + return_config.pop("metadata", None) + return return_config, load_versions, save_version @property From 1d6454c1dece5f2eea606a275f9568ce08f5a8cd Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 12 Nov 2024 16:20:27 +0000 Subject: [PATCH 15/78] Fixed versioning when load Signed-off-by: Elena Khaustova --- kedro/io/core.py | 16 +++++++++++----- kedro/io/kedro_data_catalog.py | 12 ++++++++++-- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index 1a8389fca2..5cdcb5bce3 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -153,7 +153,6 @@ class AbstractDataset(abc.ABC, Generic[_DI, _DO]): _init_args: dict[str, Any] = None def __post_init__(self, call_args: dict[str, Any]): - # print(call_args) self._init_args = call_args self._init_args.pop("self", None) @@ -229,11 +228,14 @@ def to_config(self) -> tuple[dict[str, Any], dict[str, str] | None, str | None]: cached_ds_return_config, load_versions, save_version = ( cached_ds.to_config() ) - if "versioned" in cached_ds_return_config: - return_config["versioned"] = cached_ds_return_config.pop("versioned") + if VERSIONED_FLAG_KEY in cached_ds_return_config: + return_config[VERSIONED_FLAG_KEY] = cached_ds_return_config.pop( + VERSIONED_FLAG_KEY + ) return_config["dataset"] = cached_ds_return_config version = return_config.pop(VERSION_KEY, None) + if version: load_versions, save_version = ( load_versions or version.load, @@ -542,7 +544,6 @@ def parse_dataset_definition( Returns: 2-tuple: (Dataset class object, configuration dictionary) """ - save_version = save_version or 
generate_timestamp() config = copy.deepcopy(config) # TODO: remove when removing old catalog as moved to KedroDataCatalog @@ -601,10 +602,15 @@ def parse_dataset_definition( # dataset is either versioned explicitly by the user or versioned is set to true by default # on the dataset + # Included load_version into condition if config.pop(VERSIONED_FLAG_KEY, False) or getattr( - class_obj, VERSIONED_FLAG_KEY, False + class_obj, VERSIONED_FLAG_KEY, False or load_version ): + # print() + # print("Adding flag") config[VERSION_KEY] = Version(load_version, save_version) + # print(config) + # print() return class_obj, config diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index ec03d90220..7e6c896263 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -269,14 +269,22 @@ def to_config( ) -> tuple[ dict[str, dict[str, Any]], dict[str, dict[str, Any]], - dict[str, dict[str, str] | None], + dict[str, str | None], dict[str, str | None], ]: catalog: dict[str, dict[str, Any]] = {} credentials: dict[str, dict[str, Any]] = {} - load_version: dict[str, dict[str, str] | None] = {} + load_version: dict[str, str | None] = {} save_version: dict[str, str | None] = {} + # print() + # print("Lazy:") + # print(self._lazy_datasets.keys()) + # + # print("Normal:") + # print(self._datasets.keys()) + # print() + for ds_name, ds in self._lazy_datasets.items(): unresolved_config, unresolved_credentials = ( self._config_resolver.unresolve_config_credentials(ds_name, ds.config) From 8c312377ef07bd1fa78f197bc30a480d8a5aa939 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 12 Nov 2024 16:43:27 +0000 Subject: [PATCH 16/78] Fixed linter Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 4 ++-- kedro/io/core.py | 18 ++++++++++-------- kedro/io/kedro_data_catalog.py | 6 +++--- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py 
index 5bd2149034..6949caa73b 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -266,10 +266,10 @@ def unresolve_config_credentials( cls, ds_name: str, ds_config: dict[str, dict[str, Any]] | None ) -> tuple[dict[str, dict[str, Any]], dict[str, dict[str, Any]]]: ds_config_copy = copy.deepcopy(ds_config) or {} - credentials = {} + credentials: dict[str, Any] = {} credentials_ref = f"{ds_name}_{CREDENTIALS_KEY}" - def unresolve(config: Any): + def unresolve(config: Any) -> None: # We don't expect credentials key appears more than once in the config, # So once we found the key first time we unresolve it and stop iterating after if credentials: diff --git a/kedro/io/core.py b/kedro/io/core.py index 5cdcb5bce3..510d036761 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -150,9 +150,9 @@ class AbstractDataset(abc.ABC, Generic[_DI, _DO]): need to change the `_EPHEMERAL` attribute to 'True'. """ _EPHEMERAL = False - _init_args: dict[str, Any] = None + _init_args: dict[str, Any] | None = None - def __post_init__(self, call_args: dict[str, Any]): + def __post_init__(self, call_args: dict[str, Any]) -> None: self._init_args = call_args self._init_args.pop("self", None) @@ -212,19 +212,21 @@ def to_config(self) -> tuple[dict[str, Any], dict[str, str] | None, str | None]: # TODO: check dataset factories # TODO: check transcoding # TODO: test loading back - return_config = { + return_config: dict[str, Any] = { f"{TYPE_KEY}": f"{type(self).__module__}.{type(self).__name__}" } load_versions: dict[str, str] | None = None save_version: str | None = None - return_config.update(self._init_args) + if self._init_args: + return_config.update(self._init_args) if type(self).__name__ == "CachedDataset": cached_ds = return_config.pop("dataset") + cached_ds_return_config: dict[str, Any] = {} if isinstance(cached_ds, dict): cached_ds_return_config = cached_ds - else: + elif isinstance(cached_ds, AbstractDataset): cached_ds_return_config, 
load_versions, save_version = ( cached_ds.to_config() ) @@ -339,8 +341,8 @@ def __init_subclass__(cls, **kwargs: Any) -> None: init_func: Callable = cls.__init__ - def init_decorator(previous_init): - def new_init(self, *args, **kwargs): + def init_decorator(previous_init: Callable) -> Callable: + def new_init(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] previous_init(self, *args, **kwargs) if type(self) is cls: call_args = getcallargs(init_func, self, *args, **kwargs) @@ -348,7 +350,7 @@ def new_init(self, *args, **kwargs): return new_init - cls.__init__ = init_decorator(cls.__init__) + cls.__init__ = init_decorator(cls.__init__) # type: ignore[method-assign] super().__init_subclass__(**kwargs) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 7e6c896263..5f02f31e75 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -96,7 +96,7 @@ def __init__( >>> catalog = KedroDataCatalog(datasets={"cars": cars}) """ self._config_resolver = config_resolver or CatalogConfigResolver() - self._datasets = datasets or {} + self._datasets: dict[str, AbstractDataset] = datasets or {} self._lazy_datasets: dict[str, _LazyDataset] = {} self._load_versions = load_versions or {} self._save_version = save_version @@ -294,8 +294,8 @@ def to_config( load_version[ds_name] = ds.load_version save_version[ds_name] = ds.save_version - for ds_name, ds in self._datasets.items(): - resolved_config, cur_load_versions, cur_save_version = ds.to_config() + for ds_name, ds in self._datasets.items(): # type: ignore[assignment] + resolved_config, cur_load_versions, cur_save_version = ds.to_config() # type: ignore[attr-defined] unresolved_config, unresolved_credentials = ( self._config_resolver.unresolve_config_credentials( ds_name, resolved_config From e0358812c5e631d773bbbe3c1e134debab6748dd Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 13 Nov 2024 11:21:34 +0000 Subject: [PATCH 17/78] Tested datasets factories 
Signed-off-by: Elena Khaustova --- kedro/io/core.py | 1 - 1 file changed, 1 deletion(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index 510d036761..4c3383365f 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -209,7 +209,6 @@ def from_config( return dataset def to_config(self) -> tuple[dict[str, Any], dict[str, str] | None, str | None]: - # TODO: check dataset factories # TODO: check transcoding # TODO: test loading back return_config: dict[str, Any] = { From edcdc38dd71392385297bac6d3df20e20a4d127e Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 13 Nov 2024 11:35:27 +0000 Subject: [PATCH 18/78] Tested transcoding Signed-off-by: Elena Khaustova --- kedro/io/core.py | 1 - 1 file changed, 1 deletion(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index 4c3383365f..c80bdd69ec 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -209,7 +209,6 @@ def from_config( return dataset def to_config(self) -> tuple[dict[str, Any], dict[str, str] | None, str | None]: - # TODO: check transcoding # TODO: test loading back return_config: dict[str, Any] = { f"{TYPE_KEY}": f"{type(self).__module__}.{type(self).__name__}" From 15a1e722c82b089c2fd4140a5996599120be0f3a Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 13 Nov 2024 12:58:34 +0000 Subject: [PATCH 19/78] Removed TODOs Signed-off-by: Elena Khaustova --- kedro/io/core.py | 1 - 1 file changed, 1 deletion(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index c80bdd69ec..f704e5fc57 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -209,7 +209,6 @@ def from_config( return dataset def to_config(self) -> tuple[dict[str, Any], dict[str, str] | None, str | None]: - # TODO: test loading back return_config: dict[str, Any] = { f"{TYPE_KEY}": f"{type(self).__module__}.{type(self).__name__}" } From d4e4534ec1a56fe924dfeb902c8b83fb5cdb2366 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 13 Nov 2024 13:00:10 +0000 Subject: [PATCH 20/78] Removed debug output Signed-off-by: Elena 
Khaustova --- kedro/io/kedro_data_catalog.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 5f02f31e75..844ea87e88 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -277,14 +277,6 @@ def to_config( load_version: dict[str, str | None] = {} save_version: dict[str, str | None] = {} - # print() - # print("Lazy:") - # print(self._lazy_datasets.keys()) - # - # print("Normal:") - # print(self._datasets.keys()) - # print() - for ds_name, ds in self._lazy_datasets.items(): unresolved_config, unresolved_credentials = ( self._config_resolver.unresolve_config_credentials(ds_name, ds.config) @@ -306,14 +298,6 @@ def to_config( load_version[ds_name] = cur_load_versions save_version[ds_name] = cur_save_version - # print(catalog) - # print("-") - # print(credentials) - # print("-") - # print(load_version) - # print("-") - # print(save_version) - return catalog, credentials, load_version, save_version @classmethod From e7e8af56a129ad687547da8d55aba294158688d7 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 13 Nov 2024 13:04:01 +0000 Subject: [PATCH 21/78] Removed debug output Signed-off-by: Elena Khaustova --- kedro/io/core.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index f704e5fc57..9db5177edd 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -605,11 +605,7 @@ def parse_dataset_definition( if config.pop(VERSIONED_FLAG_KEY, False) or getattr( class_obj, VERSIONED_FLAG_KEY, False or load_version ): - # print() - # print("Adding flag") config[VERSION_KEY] = Version(load_version, save_version) - # print(config) - # print() return class_obj, config From 1b6be8ed0b28567ec1473944cb0360340d04d118 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 14 Nov 2024 11:50:18 +0000 Subject: [PATCH 22/78] Added logic to set VERSIONED_FLAG_KEY Signed-off-by: Elena Khaustova --- kedro/io/core.py | 5 +++-- 1 
file changed, 3 insertions(+), 2 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index 9db5177edd..3d61105663 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -236,6 +236,7 @@ def to_config(self) -> tuple[dict[str, Any], dict[str, str] | None, str | None]: version = return_config.pop(VERSION_KEY, None) if version: + return_config[VERSIONED_FLAG_KEY] = True load_versions, save_version = ( load_versions or version.load, save_version or version.save, @@ -543,6 +544,7 @@ def parse_dataset_definition( Returns: 2-tuple: (Dataset class object, configuration dictionary) """ + save_version = save_version or generate_timestamp() config = copy.deepcopy(config) # TODO: remove when removing old catalog as moved to KedroDataCatalog @@ -601,9 +603,8 @@ def parse_dataset_definition( # dataset is either versioned explicitly by the user or versioned is set to true by default # on the dataset - # Included load_version into condition if config.pop(VERSIONED_FLAG_KEY, False) or getattr( - class_obj, VERSIONED_FLAG_KEY, False or load_version + class_obj, VERSIONED_FLAG_KEY, False ): config[VERSION_KEY] = Version(load_version, save_version) From c6dc38035406c3f870eddea5a551a473df8e141f Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 14 Nov 2024 12:14:28 +0000 Subject: [PATCH 23/78] Updated version set up Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 844ea87e88..d7622e0783 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -18,6 +18,7 @@ from kedro.io.catalog_config_resolver import CatalogConfigResolver, Patterns from kedro.io.core import ( + VERSIONED_FLAG_KEY, AbstractDataset, AbstractVersionedDataset, CatalogProtocol, @@ -283,8 +284,12 @@ def to_config( ) catalog[ds_name] = unresolved_config credentials.update(unresolved_credentials) - load_version[ds_name] = 
ds.load_version - save_version[ds_name] = ds.save_version + if catalog[ds_name].get(VERSIONED_FLAG_KEY, None): + load_version[ds_name] = ds.load_version + save_version[ds_name] = ds.save_version + else: + load_version[ds_name] = None + save_version[ds_name] = None for ds_name, ds in self._datasets.items(): # type: ignore[assignment] resolved_config, cur_load_versions, cur_save_version = ds.to_config() # type: ignore[attr-defined] From 54b0793386c9e7ea3caa5ee0b259e5ecad237a1d Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 14 Nov 2024 12:34:16 +0000 Subject: [PATCH 24/78] Added TODO for versioning Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 1 + 1 file changed, 1 insertion(+) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index d7622e0783..937794546c 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -284,6 +284,7 @@ def to_config( ) catalog[ds_name] = unresolved_config credentials.update(unresolved_credentials) + # TODO: Update when resolve #4327 if catalog[ds_name].get(VERSIONED_FLAG_KEY, None): load_version[ds_name] = ds.load_version save_version[ds_name] = ds.save_version From f183e609d08cf3bf38682e16c067f47807909850 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 14 Nov 2024 15:44:48 +0000 Subject: [PATCH 25/78] Added tests for unresolve_config_credentials Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 13 +++++------ tests/io/test_kedro_data_catalog.py | 36 +++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 7 deletions(-) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index 6949caa73b..2da4af9942 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -237,8 +237,9 @@ def _extract_patterns( return sorted_patterns, user_default + @classmethod def _resolve_config_credentials( - self, + cls, config: dict[str, dict[str, Any]] | None, 
credentials: dict[str, dict[str, Any]] | None, ) -> dict[str, dict[str, Any]]: @@ -254,8 +255,8 @@ def _resolve_config_credentials( "\nHint: If this catalog entry is intended for variable interpolation, " "make sure that the key is preceded by an underscore." ) - if not self.is_pattern(ds_name): - resolved_configs[ds_name] = self._resolve_credentials( + if not cls.is_pattern(ds_name): + resolved_configs[ds_name] = cls._resolve_credentials( ds_config, credentials ) @@ -263,17 +264,15 @@ def _resolve_config_credentials( @classmethod def unresolve_config_credentials( - cls, ds_name: str, ds_config: dict[str, dict[str, Any]] | None + cls, cred_name: str, ds_config: dict[str, dict[str, Any]] | None ) -> tuple[dict[str, dict[str, Any]], dict[str, dict[str, Any]]]: ds_config_copy = copy.deepcopy(ds_config) or {} credentials: dict[str, Any] = {} - credentials_ref = f"{ds_name}_{CREDENTIALS_KEY}" + credentials_ref = f"{cred_name}_{CREDENTIALS_KEY}" def unresolve(config: Any) -> None: # We don't expect credentials key appears more than once in the config, # So once we found the key first time we unresolve it and stop iterating after - if credentials: - return for key, val in config.items(): if key == CREDENTIALS_KEY and config[key]: credentials[credentials_ref] = config[key] diff --git a/tests/io/test_kedro_data_catalog.py b/tests/io/test_kedro_data_catalog.py index 367580ef80..22a37e441a 100644 --- a/tests/io/test_kedro_data_catalog.py +++ b/tests/io/test_kedro_data_catalog.py @@ -10,6 +10,7 @@ from pandas.testing import assert_frame_equal from kedro.io import ( + CatalogConfigResolver, DatasetAlreadyExistsError, DatasetError, DatasetNotFoundError, @@ -431,6 +432,41 @@ def test_missing_nested_credentials(self, correct_config_with_nested_creds): with pytest.raises(KeyError, match=pattern): KedroDataCatalog.from_config(**correct_config_with_nested_creds) + def test_unresolve_config_credentials(self, correct_config): + """Test unresolve dataset credentials to original 
format.""" + config = correct_config["catalog"] + credentials = correct_config["credentials"] + resolved_configs = CatalogConfigResolver._resolve_config_credentials( + config, credentials + ) + + unresolved_config, unresolved_credentials = ( + CatalogConfigResolver.unresolve_config_credentials( + cred_name="s3", ds_config=resolved_configs + ) + ) + assert config == unresolved_config + assert credentials == unresolved_credentials + + def test_unresolve_config_credentials_two_keys(self, correct_config): + """Test unresolve dataset credentials to original format when two credentials keys provided.""" + config = correct_config["catalog"] + credentials = correct_config["credentials"] + + resolved_configs = CatalogConfigResolver._resolve_config_credentials( + config, credentials + ) + resolved_configs["cars"]["metadata"] = {"credentials": {}} + + unresolved_config, unresolved_credentials = ( + CatalogConfigResolver.unresolve_config_credentials( + cred_name="s3", ds_config=resolved_configs + ) + ) + unresolved_config["cars"].pop("metadata") + assert config == unresolved_config + assert credentials == unresolved_credentials + def test_missing_dependency(self, correct_config, mocker): """Test that dependency is missing.""" pattern = "dependency issue" From 0d9d2415a120d4cd33e3c3a5bea6982b9de5433c Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 14 Nov 2024 16:36:24 +0000 Subject: [PATCH 26/78] Implemented test_to_config Signed-off-by: Elena Khaustova --- tests/io/conftest.py | 21 +++++++++++++++++++++ tests/io/test_kedro_data_catalog.py | 26 ++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/tests/io/conftest.py b/tests/io/conftest.py index 9abce4c83e..187891e4be 100644 --- a/tests/io/conftest.py +++ b/tests/io/conftest.py @@ -51,6 +51,27 @@ def correct_config(filepath): } +@pytest.fixture +def correct_config_versioned(filepath): + return { + "catalog": { + "boats": { + "type": "pandas.CSVDataset", + "filepath": filepath, + "versioned": True, 
+ }, + "cars": { + "type": "pandas.CSVDataset", + "filepath": "s3://test_bucket/test_file.csv", + "credentials": "cars_credentials", + }, + }, + "credentials": { + "cars_credentials": {"key": "FAKE_ACCESS_KEY", "secret": "FAKE_SECRET_KEY"} + }, + } + + @pytest.fixture def correct_config_with_nested_creds(correct_config): correct_config["catalog"]["cars"]["credentials"] = { diff --git a/tests/io/test_kedro_data_catalog.py b/tests/io/test_kedro_data_catalog.py index 22a37e441a..57139916b9 100644 --- a/tests/io/test_kedro_data_catalog.py +++ b/tests/io/test_kedro_data_catalog.py @@ -293,6 +293,32 @@ def test_release(self, data_catalog): """Test release is called without errors""" data_catalog.release("test") + class TestKedroDataCatalogToConfig: + def test_to_config(self, correct_config_versioned, dataset, filepath): + config = correct_config_versioned["catalog"] + credentials = correct_config_versioned["credentials"] + catalog = KedroDataCatalog.from_config(config, credentials) + catalog["resolved_ds"] = dataset + + catalog_config, catalog_credentials, load_version, save_version = ( + catalog.to_config() + ) + + expected_config = { + "resolved_ds": { + "type": "kedro_datasets.pandas.csv_dataset.CSVDataset", + "filepath": filepath, + "save_args": {"index": False}, + "load_args": None, + "credentials": None, + "fs_args": None, + } + } + expected_config.update(config) + + assert catalog_config == expected_config + assert catalog_credentials == credentials + class TestKedroDataCatalogFromConfig: def test_from_correct_config(self, data_catalog_from_config, dummy_dataframe): """Test populating the data catalog from config""" From 763e635d231041631b1f24ccfa2040f71c0d368a Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 14 Nov 2024 16:54:43 +0000 Subject: [PATCH 27/78] Added test with MemoryDataset Signed-off-by: Elena Khaustova --- tests/io/test_kedro_data_catalog.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git 
a/tests/io/test_kedro_data_catalog.py b/tests/io/test_kedro_data_catalog.py index 57139916b9..d2293a4de0 100644 --- a/tests/io/test_kedro_data_catalog.py +++ b/tests/io/test_kedro_data_catalog.py @@ -295,10 +295,12 @@ def test_release(self, data_catalog): class TestKedroDataCatalogToConfig: def test_to_config(self, correct_config_versioned, dataset, filepath): + """Test dumping catalog config""" config = correct_config_versioned["catalog"] credentials = correct_config_versioned["credentials"] catalog = KedroDataCatalog.from_config(config, credentials) catalog["resolved_ds"] = dataset + catalog["memory_ds"] = [1, 2, 3] catalog_config, catalog_credentials, load_version, save_version = ( catalog.to_config() @@ -312,7 +314,11 @@ def test_to_config(self, correct_config_versioned, dataset, filepath): "load_args": None, "credentials": None, "fs_args": None, - } + }, + "memory_ds": { + "type": "kedro.io.memory_dataset.MemoryDataset", + "copy_mode": None, + }, } expected_config.update(config) From 8795dd618d4424b085cc09dcfb3c05571f643402 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 14 Nov 2024 17:23:51 +0000 Subject: [PATCH 28/78] Extended test examples Signed-off-by: Elena Khaustova --- tests/io/conftest.py | 21 ++++++++++++++++++++- tests/io/test_kedro_data_catalog.py | 25 +++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/tests/io/conftest.py b/tests/io/conftest.py index 187891e4be..9fc6576f5a 100644 --- a/tests/io/conftest.py +++ b/tests/io/conftest.py @@ -65,9 +65,28 @@ def correct_config_versioned(filepath): "filepath": "s3://test_bucket/test_file.csv", "credentials": "cars_credentials", }, + "cars_ibis": { + "type": "ibis.FileDataset", + "filepath": "cars_ibis.csv", + "file_format": "csv", + "table_name": "cars", + "connection": {"backend": "duckdb", "database": "company.db"}, + "load_args": {"sep": ",", "nullstr": "#NA"}, + "save_args": {"sep": ",", "nullstr": "#NA"}, + }, + "cached_ds": { + "type": "CachedDataset", 
+ "versioned": "true", + "dataset": { + "type": "pandas.CSVDataset", + "filepath": "cached_ds.csv", + "credentials": "cached_ds_credentials", + }, + }, }, "credentials": { - "cars_credentials": {"key": "FAKE_ACCESS_KEY", "secret": "FAKE_SECRET_KEY"} + "cars_credentials": {"key": "FAKE_ACCESS_KEY", "secret": "FAKE_SECRET_KEY"}, + "cached_ds_credentials": {"key": "KEY", "secret": "SECRET"}, }, } diff --git a/tests/io/test_kedro_data_catalog.py b/tests/io/test_kedro_data_catalog.py index d2293a4de0..e96a9e8d01 100644 --- a/tests/io/test_kedro_data_catalog.py +++ b/tests/io/test_kedro_data_catalog.py @@ -10,6 +10,7 @@ from pandas.testing import assert_frame_equal from kedro.io import ( + CachedDataset, CatalogConfigResolver, DatasetAlreadyExistsError, DatasetError, @@ -21,6 +22,7 @@ from kedro.io.core import ( _DEFAULT_PACKAGES, VERSION_FORMAT, + Version, generate_timestamp, parse_dataset_definition, ) @@ -302,6 +304,16 @@ def test_to_config(self, correct_config_versioned, dataset, filepath): catalog["resolved_ds"] = dataset catalog["memory_ds"] = [1, 2, 3] + version = Version( + load="fake_load_version.csv", # load exact version + save="fake_save_version.csv", # save to exact version + ) + versioned_dataset = CSVDataset( + filepath="shuttles.csv", version=version, metadata=[1, 2, 3] + ) + cached_versioned_dataset = CachedDataset(dataset=versioned_dataset) + catalog["cached_versioned_dataset"] = cached_versioned_dataset + catalog_config, catalog_credentials, load_version, save_version = ( catalog.to_config() ) @@ -319,6 +331,19 @@ def test_to_config(self, correct_config_versioned, dataset, filepath): "type": "kedro.io.memory_dataset.MemoryDataset", "copy_mode": None, }, + "cached_versioned_dataset": { + "type": "kedro.io.cached_dataset.CachedDataset", + "copy_mode": None, + "versioned": True, + "dataset": { + "type": "kedro_datasets.pandas.csv_dataset.CSVDataset", + "filepath": "shuttles.csv", + "load_args": None, + "save_args": None, + "credentials": None, + 
"fs_args": None, + }, + }, } expected_config.update(config) From e3289b47fd7b1d197d79a03e55fd0da4651d1125 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 14 Nov 2024 17:40:13 +0000 Subject: [PATCH 29/78] Materialized cached_ds Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 2 +- tests/io/conftest.py | 5 +++-- tests/io/test_kedro_data_catalog.py | 4 ++++ 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 937794546c..00e73ca8b9 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -284,7 +284,7 @@ def to_config( ) catalog[ds_name] = unresolved_config credentials.update(unresolved_credentials) - # TODO: Update when resolve #4327 + # TODO: Update when #4327 resolved if catalog[ds_name].get(VERSIONED_FLAG_KEY, None): load_version[ds_name] = ds.load_version save_version[ds_name] = ds.save_version diff --git a/tests/io/conftest.py b/tests/io/conftest.py index 9fc6576f5a..47d783f542 100644 --- a/tests/io/conftest.py +++ b/tests/io/conftest.py @@ -75,13 +75,14 @@ def correct_config_versioned(filepath): "save_args": {"sep": ",", "nullstr": "#NA"}, }, "cached_ds": { - "type": "CachedDataset", - "versioned": "true", + "type": "kedro.io.cached_dataset.CachedDataset", + "versioned": True, "dataset": { "type": "pandas.CSVDataset", "filepath": "cached_ds.csv", "credentials": "cached_ds_credentials", }, + "copy_mode": None, }, }, "credentials": { diff --git a/tests/io/test_kedro_data_catalog.py b/tests/io/test_kedro_data_catalog.py index e96a9e8d01..7d88bf8c5c 100644 --- a/tests/io/test_kedro_data_catalog.py +++ b/tests/io/test_kedro_data_catalog.py @@ -303,6 +303,8 @@ def test_to_config(self, correct_config_versioned, dataset, filepath): catalog = KedroDataCatalog.from_config(config, credentials) catalog["resolved_ds"] = dataset catalog["memory_ds"] = [1, 2, 3] + # Materialize cached_ds + _ = catalog["cached_ds"] version = Version( 
load="fake_load_version.csv", # load exact version @@ -347,6 +349,8 @@ def test_to_config(self, correct_config_versioned, dataset, filepath): } expected_config.update(config) + # TODO: Add expected load/save versions when #4327 resolved + assert catalog_config == expected_config assert catalog_credentials == credentials From ae628864e22be36570173a60143904631531ea1c Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 18 Nov 2024 10:22:19 +0000 Subject: [PATCH 30/78] Exclude parameters Signed-off-by: Elena Khaustova --- kedro/framework/cli/catalog.py | 6 +----- kedro/io/core.py | 5 +++++ kedro/io/kedro_data_catalog.py | 5 +++++ 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/kedro/framework/cli/catalog.py b/kedro/framework/cli/catalog.py index 25fad6083d..84098f70c6 100644 --- a/kedro/framework/cli/catalog.py +++ b/kedro/framework/cli/catalog.py @@ -13,6 +13,7 @@ from kedro.framework.cli.utils import KedroCliError, env_option, split_string from kedro.framework.project import pipelines, settings from kedro.framework.session import KedroSession +from kedro.io import is_parameter from kedro.io.data_catalog import DataCatalog if TYPE_CHECKING: @@ -27,11 +28,6 @@ def _create_session(package_name: str, **kwargs: Any) -> KedroSession: return KedroSession.create(**kwargs) -def is_parameter(dataset_name: str) -> bool: - """Check if dataset is a parameter.""" - return dataset_name.startswith("params:") or dataset_name == "parameters" - - @click.group(name="Kedro") def catalog_cli() -> None: # pragma: no cover pass diff --git a/kedro/io/core.py b/kedro/io/core.py index 3d61105663..629f90c1b0 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -1019,3 +1019,8 @@ def confirm(self, name: str) -> None: def shallow_copy(self, extra_dataset_patterns: Patterns | None = None) -> _C: """Returns a shallow copy of the current object.""" ... 
+ + +def is_parameter(dataset_name: str) -> bool: + """Check if dataset is a parameter.""" + return dataset_name.startswith("params:") or dataset_name == "parameters" diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 00e73ca8b9..61af97e2f9 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -27,6 +27,7 @@ DatasetNotFoundError, Version, generate_timestamp, + is_parameter, ) from kedro.io.memory_dataset import MemoryDataset from kedro.utils import _format_rich, _has_rich_handler @@ -279,6 +280,8 @@ def to_config( save_version: dict[str, str | None] = {} for ds_name, ds in self._lazy_datasets.items(): + if is_parameter(ds_name): + continue unresolved_config, unresolved_credentials = ( self._config_resolver.unresolve_config_credentials(ds_name, ds.config) ) @@ -293,6 +296,8 @@ def to_config( save_version[ds_name] = None for ds_name, ds in self._datasets.items(): # type: ignore[assignment] + if is_parameter(ds_name): + continue resolved_config, cur_load_versions, cur_save_version = ds.to_config() # type: ignore[attr-defined] unresolved_config, unresolved_credentials = ( self._config_resolver.unresolve_config_credentials( From b2ebfe234cb35a4c69fced6b63fb677af816789d Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 18 Nov 2024 10:25:54 +0000 Subject: [PATCH 31/78] Fixed import Signed-off-by: Elena Khaustova --- kedro/framework/cli/catalog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro/framework/cli/catalog.py b/kedro/framework/cli/catalog.py index 84098f70c6..851dd6951a 100644 --- a/kedro/framework/cli/catalog.py +++ b/kedro/framework/cli/catalog.py @@ -13,7 +13,7 @@ from kedro.framework.cli.utils import KedroCliError, env_option, split_string from kedro.framework.project import pipelines, settings from kedro.framework.session import KedroSession -from kedro.io import is_parameter +from kedro.io.core import is_parameter from kedro.io.data_catalog import DataCatalog if 
TYPE_CHECKING: From a07107aa194aacd4f139f356fb56236ec4474268 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 18 Nov 2024 10:51:58 +0000 Subject: [PATCH 32/78] Added test with parameters Signed-off-by: Elena Khaustova --- tests/io/conftest.py | 5 +++++ tests/io/test_kedro_data_catalog.py | 2 ++ 2 files changed, 7 insertions(+) diff --git a/tests/io/conftest.py b/tests/io/conftest.py index 47d783f542..ef137372b2 100644 --- a/tests/io/conftest.py +++ b/tests/io/conftest.py @@ -84,6 +84,11 @@ def correct_config_versioned(filepath): }, "copy_mode": None, }, + "parameters": { + "type": "kedro.io.memory_dataset.MemoryDataset", + "data": [4, 5, 6], + "copy_mode": None, + }, }, "credentials": { "cars_credentials": {"key": "FAKE_ACCESS_KEY", "secret": "FAKE_SECRET_KEY"}, diff --git a/tests/io/test_kedro_data_catalog.py b/tests/io/test_kedro_data_catalog.py index 7d88bf8c5c..7833dcebc6 100644 --- a/tests/io/test_kedro_data_catalog.py +++ b/tests/io/test_kedro_data_catalog.py @@ -303,6 +303,7 @@ def test_to_config(self, correct_config_versioned, dataset, filepath): catalog = KedroDataCatalog.from_config(config, credentials) catalog["resolved_ds"] = dataset catalog["memory_ds"] = [1, 2, 3] + catalog["params:a.b"] = {"abc": "def"} # Materialize cached_ds _ = catalog["cached_ds"] @@ -348,6 +349,7 @@ def test_to_config(self, correct_config_versioned, dataset, filepath): }, } expected_config.update(config) + expected_config.pop("parameters", None) # TODO: Add expected load/save versions when #4327 resolved From e5adb5d52d72a24c0c26d689b767c51ededcf6f6 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 19 Nov 2024 15:25:49 +0000 Subject: [PATCH 33/78] Moved tests for CatalogConfigResolver to a separate file Signed-off-by: Elena Khaustova --- tests/io/test_catalog_config_resolver.py | 38 ++++++++++++++++++++++++ tests/io/test_kedro_data_catalog.py | 36 ---------------------- 2 files changed, 38 insertions(+), 36 deletions(-) create mode 100644 
tests/io/test_catalog_config_resolver.py diff --git a/tests/io/test_catalog_config_resolver.py b/tests/io/test_catalog_config_resolver.py new file mode 100644 index 0000000000..ddeccde34f --- /dev/null +++ b/tests/io/test_catalog_config_resolver.py @@ -0,0 +1,38 @@ +from kedro.io import CatalogConfigResolver + + +class TestCatalogConfigResolver: + def test_unresolve_config_credentials(self, correct_config): + """Test unresolve dataset credentials to original format.""" + config = correct_config["catalog"] + credentials = correct_config["credentials"] + resolved_configs = CatalogConfigResolver._resolve_config_credentials( + config, credentials + ) + + unresolved_config, unresolved_credentials = ( + CatalogConfigResolver.unresolve_config_credentials( + cred_name="s3", ds_config=resolved_configs + ) + ) + assert config == unresolved_config + assert credentials == unresolved_credentials + + def test_unresolve_config_credentials_two_keys(self, correct_config): + """Test unresolve dataset credentials to original format when two credentials keys provided.""" + config = correct_config["catalog"] + credentials = correct_config["credentials"] + + resolved_configs = CatalogConfigResolver._resolve_config_credentials( + config, credentials + ) + resolved_configs["cars"]["metadata"] = {"credentials": {}} + + unresolved_config, unresolved_credentials = ( + CatalogConfigResolver.unresolve_config_credentials( + cred_name="s3", ds_config=resolved_configs + ) + ) + unresolved_config["cars"].pop("metadata") + assert config == unresolved_config + assert credentials == unresolved_credentials diff --git a/tests/io/test_kedro_data_catalog.py b/tests/io/test_kedro_data_catalog.py index 7833dcebc6..34db055127 100644 --- a/tests/io/test_kedro_data_catalog.py +++ b/tests/io/test_kedro_data_catalog.py @@ -11,7 +11,6 @@ from kedro.io import ( CachedDataset, - CatalogConfigResolver, DatasetAlreadyExistsError, DatasetError, DatasetNotFoundError, @@ -495,41 +494,6 @@ def 
test_missing_nested_credentials(self, correct_config_with_nested_creds): with pytest.raises(KeyError, match=pattern): KedroDataCatalog.from_config(**correct_config_with_nested_creds) - def test_unresolve_config_credentials(self, correct_config): - """Test unresolve dataset credentials to original format.""" - config = correct_config["catalog"] - credentials = correct_config["credentials"] - resolved_configs = CatalogConfigResolver._resolve_config_credentials( - config, credentials - ) - - unresolved_config, unresolved_credentials = ( - CatalogConfigResolver.unresolve_config_credentials( - cred_name="s3", ds_config=resolved_configs - ) - ) - assert config == unresolved_config - assert credentials == unresolved_credentials - - def test_unresolve_config_credentials_two_keys(self, correct_config): - """Test unresolve dataset credentials to original format when two credentials keys provided.""" - config = correct_config["catalog"] - credentials = correct_config["credentials"] - - resolved_configs = CatalogConfigResolver._resolve_config_credentials( - config, credentials - ) - resolved_configs["cars"]["metadata"] = {"credentials": {}} - - unresolved_config, unresolved_credentials = ( - CatalogConfigResolver.unresolve_config_credentials( - cred_name="s3", ds_config=resolved_configs - ) - ) - unresolved_config["cars"].pop("metadata") - assert config == unresolved_config - assert credentials == unresolved_credentials - def test_missing_dependency(self, correct_config, mocker): """Test that dependency is missing.""" pattern = "dependency issue" From bdf45a359c833fcc2c7193dd5dfc96fc42a2abcc Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 19 Nov 2024 15:32:13 +0000 Subject: [PATCH 34/78] Made unresolve_config_credentials staticmethod Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index 
2da4af9942..e2b9a764e1 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -262,9 +262,9 @@ def _resolve_config_credentials( return resolved_configs - @classmethod + @staticmethod def unresolve_config_credentials( - cls, cred_name: str, ds_config: dict[str, dict[str, Any]] | None + cred_name: str, ds_config: dict[str, dict[str, Any]] | None ) -> tuple[dict[str, dict[str, Any]], dict[str, dict[str, Any]]]: ds_config_copy = copy.deepcopy(ds_config) or {} credentials: dict[str, Any] = {} From 33d679113d1d9db7ef7510b44d3382e95ca717e6 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 19 Nov 2024 15:33:41 +0000 Subject: [PATCH 35/78] Updated comment to clarify meaning Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index e2b9a764e1..725c56e21e 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -271,7 +271,7 @@ def unresolve_config_credentials( credentials_ref = f"{cred_name}_{CREDENTIALS_KEY}" def unresolve(config: Any) -> None: - # We don't expect credentials key appears more than once in the config, + # We don't expect credentials key appears more than once within the same dataset config, # So once we found the key first time we unresolve it and stop iterating after for key, val in config.items(): if key == CREDENTIALS_KEY and config[key]: From 33ff979fe8a971ac6ad44965899a079d61ce0e17 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 19 Nov 2024 15:37:24 +0000 Subject: [PATCH 36/78] Moved to_config after from_config Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 90 +++++++++++++++++----------------- 1 file changed, 45 insertions(+), 45 deletions(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 61af97e2f9..0b97e19371 100644 --- 
a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -266,51 +266,6 @@ def _ipython_key_completions_(self) -> list[str]: def _logger(self) -> logging.Logger: return logging.getLogger(__name__) - def to_config( - self, - ) -> tuple[ - dict[str, dict[str, Any]], - dict[str, dict[str, Any]], - dict[str, str | None], - dict[str, str | None], - ]: - catalog: dict[str, dict[str, Any]] = {} - credentials: dict[str, dict[str, Any]] = {} - load_version: dict[str, str | None] = {} - save_version: dict[str, str | None] = {} - - for ds_name, ds in self._lazy_datasets.items(): - if is_parameter(ds_name): - continue - unresolved_config, unresolved_credentials = ( - self._config_resolver.unresolve_config_credentials(ds_name, ds.config) - ) - catalog[ds_name] = unresolved_config - credentials.update(unresolved_credentials) - # TODO: Update when #4327 resolved - if catalog[ds_name].get(VERSIONED_FLAG_KEY, None): - load_version[ds_name] = ds.load_version - save_version[ds_name] = ds.save_version - else: - load_version[ds_name] = None - save_version[ds_name] = None - - for ds_name, ds in self._datasets.items(): # type: ignore[assignment] - if is_parameter(ds_name): - continue - resolved_config, cur_load_versions, cur_save_version = ds.to_config() # type: ignore[attr-defined] - unresolved_config, unresolved_credentials = ( - self._config_resolver.unresolve_config_credentials( - ds_name, resolved_config - ) - ) - catalog[ds_name] = unresolved_config - credentials.update(unresolved_credentials) - load_version[ds_name] = cur_load_versions - save_version[ds_name] = cur_save_version - - return catalog, credentials, load_version, save_version - @classmethod def from_config( cls, @@ -411,6 +366,51 @@ class to be loaded is specified with the key ``type`` and their config_resolver=config_resolver, ) + def to_config( + self, + ) -> tuple[ + dict[str, dict[str, Any]], + dict[str, dict[str, Any]], + dict[str, str | None], + dict[str, str | None], + ]: + catalog: dict[str, 
dict[str, Any]] = {} + credentials: dict[str, dict[str, Any]] = {} + load_version: dict[str, str | None] = {} + save_version: dict[str, str | None] = {} + + for ds_name, ds in self._lazy_datasets.items(): + if is_parameter(ds_name): + continue + unresolved_config, unresolved_credentials = ( + self._config_resolver.unresolve_config_credentials(ds_name, ds.config) + ) + catalog[ds_name] = unresolved_config + credentials.update(unresolved_credentials) + # TODO: Update when #4327 resolved + if catalog[ds_name].get(VERSIONED_FLAG_KEY, None): + load_version[ds_name] = ds.load_version + save_version[ds_name] = ds.save_version + else: + load_version[ds_name] = None + save_version[ds_name] = None + + for ds_name, ds in self._datasets.items(): # type: ignore[assignment] + if is_parameter(ds_name): + continue + resolved_config, cur_load_versions, cur_save_version = ds.to_config() # type: ignore[attr-defined] + unresolved_config, unresolved_credentials = ( + self._config_resolver.unresolve_config_credentials( + ds_name, resolved_config + ) + ) + catalog[ds_name] = unresolved_config + credentials.update(unresolved_credentials) + load_version[ds_name] = cur_load_versions + save_version[ds_name] = cur_save_version + + return catalog, credentials, load_version, save_version + @staticmethod def _validate_dataset_config(ds_name: str, ds_config: Any) -> None: if not isinstance(ds_config, dict): From 7546540db05d478b6e82acf38e2ebbb6379fece5 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 19 Nov 2024 15:46:44 +0000 Subject: [PATCH 37/78] Returned is_parameter for catalog and added TODOs Signed-off-by: Elena Khaustova --- kedro/framework/cli/catalog.py | 7 ++++++- kedro/io/core.py | 3 ++- kedro/io/kedro_data_catalog.py | 6 +++--- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/kedro/framework/cli/catalog.py b/kedro/framework/cli/catalog.py index 851dd6951a..af4fa7a6ea 100644 --- a/kedro/framework/cli/catalog.py +++ b/kedro/framework/cli/catalog.py @@ -13,7 +13,6 
@@ from kedro.framework.cli.utils import KedroCliError, env_option, split_string from kedro.framework.project import pipelines, settings from kedro.framework.session import KedroSession -from kedro.io.core import is_parameter from kedro.io.data_catalog import DataCatalog if TYPE_CHECKING: @@ -28,6 +27,12 @@ def _create_session(package_name: str, **kwargs: Any) -> KedroSession: return KedroSession.create(**kwargs) +def is_parameter(dataset_name: str) -> bool: + # TODO: when breaking change replace with is_parameter from kedro/io/core.py + """Check if dataset is a parameter.""" + return dataset_name.startswith("params:") or dataset_name == "parameters" + + @click.group(name="Kedro") def catalog_cli() -> None: # pragma: no cover pass diff --git a/kedro/io/core.py b/kedro/io/core.py index 629f90c1b0..f3bd8d73f4 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -1021,6 +1021,7 @@ def shallow_copy(self, extra_dataset_patterns: Patterns | None = None) -> _C: ... -def is_parameter(dataset_name: str) -> bool: +def _is_parameter(dataset_name: str) -> bool: + # TODO: when breaking change replace with is_parameter and remove is_parameter from kedro/framework/cli/catalog.py """Check if dataset is a parameter.""" return dataset_name.startswith("params:") or dataset_name == "parameters" diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 0b97e19371..2e954ab590 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -26,8 +26,8 @@ DatasetError, DatasetNotFoundError, Version, + _is_parameter, generate_timestamp, - is_parameter, ) from kedro.io.memory_dataset import MemoryDataset from kedro.utils import _format_rich, _has_rich_handler @@ -380,7 +380,7 @@ def to_config( save_version: dict[str, str | None] = {} for ds_name, ds in self._lazy_datasets.items(): - if is_parameter(ds_name): + if _is_parameter(ds_name): continue unresolved_config, unresolved_credentials = ( 
self._config_resolver.unresolve_config_credentials(ds_name, ds.config) @@ -396,7 +396,7 @@ def to_config( save_version[ds_name] = None for ds_name, ds in self._datasets.items(): # type: ignore[assignment] - if is_parameter(ds_name): + if _is_parameter(ds_name): continue resolved_config, cur_load_versions, cur_save_version = ds.to_config() # type: ignore[attr-defined] unresolved_config, unresolved_credentials = ( From c37c04ddc92a8d901860c161d99b34ed0f5252ab Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 20 Nov 2024 13:17:07 +0000 Subject: [PATCH 38/78] Renamed catalog config resolver methods Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 6 +++--- kedro/io/kedro_data_catalog.py | 6 ++---- tests/io/test_catalog_config_resolver.py | 12 ++++++------ 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index 725c56e21e..41299f77d6 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -30,7 +30,7 @@ def __init__( self._dataset_patterns, self._default_pattern = self._extract_patterns( config, credentials ) - self._resolved_configs = self._resolve_config_credentials(config, credentials) + self._resolved_configs = self.resolve_credentials(config, credentials) @property def config(self) -> dict[str, dict[str, Any]]: @@ -238,7 +238,7 @@ def _extract_patterns( return sorted_patterns, user_default @classmethod - def _resolve_config_credentials( + def resolve_credentials( cls, config: dict[str, dict[str, Any]] | None, credentials: dict[str, dict[str, Any]] | None, @@ -263,7 +263,7 @@ def _resolve_config_credentials( return resolved_configs @staticmethod - def unresolve_config_credentials( + def unresolve_credentials( cred_name: str, ds_config: dict[str, dict[str, Any]] | None ) -> tuple[dict[str, dict[str, Any]], dict[str, dict[str, Any]]]: ds_config_copy = copy.deepcopy(ds_config) or {} diff --git 
a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 2e954ab590..5123066701 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -383,7 +383,7 @@ def to_config( if _is_parameter(ds_name): continue unresolved_config, unresolved_credentials = ( - self._config_resolver.unresolve_config_credentials(ds_name, ds.config) + self._config_resolver.unresolve_credentials(ds_name, ds.config) ) catalog[ds_name] = unresolved_config credentials.update(unresolved_credentials) @@ -400,9 +400,7 @@ def to_config( continue resolved_config, cur_load_versions, cur_save_version = ds.to_config() # type: ignore[attr-defined] unresolved_config, unresolved_credentials = ( - self._config_resolver.unresolve_config_credentials( - ds_name, resolved_config - ) + self._config_resolver.unresolve_credentials(ds_name, resolved_config) ) catalog[ds_name] = unresolved_config credentials.update(unresolved_credentials) diff --git a/tests/io/test_catalog_config_resolver.py b/tests/io/test_catalog_config_resolver.py index ddeccde34f..db5ee6741c 100644 --- a/tests/io/test_catalog_config_resolver.py +++ b/tests/io/test_catalog_config_resolver.py @@ -2,34 +2,34 @@ class TestCatalogConfigResolver: - def test_unresolve_config_credentials(self, correct_config): + def test_unresolve_credentials(self, correct_config): """Test unresolve dataset credentials to original format.""" config = correct_config["catalog"] credentials = correct_config["credentials"] - resolved_configs = CatalogConfigResolver._resolve_config_credentials( + resolved_configs = CatalogConfigResolver.resolve_credentials( config, credentials ) unresolved_config, unresolved_credentials = ( - CatalogConfigResolver.unresolve_config_credentials( + CatalogConfigResolver.unresolve_credentials( cred_name="s3", ds_config=resolved_configs ) ) assert config == unresolved_config assert credentials == unresolved_credentials - def test_unresolve_config_credentials_two_keys(self, correct_config): + def 
test_unresolve_credentials_two_keys(self, correct_config): """Test unresolve dataset credentials to original format when two credentials keys provided.""" config = correct_config["catalog"] credentials = correct_config["credentials"] - resolved_configs = CatalogConfigResolver._resolve_config_credentials( + resolved_configs = CatalogConfigResolver.resolve_credentials( config, credentials ) resolved_configs["cars"]["metadata"] = {"credentials": {}} unresolved_config, unresolved_credentials = ( - CatalogConfigResolver.unresolve_config_credentials( + CatalogConfigResolver.unresolve_credentials( cred_name="s3", ds_config=resolved_configs ) ) From 591f4a0d617584606b7f67ff02f3e5907addd3a8 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 21 Nov 2024 17:11:20 +0000 Subject: [PATCH 39/78] Implemented _validate_versions method Signed-off-by: Elena Khaustova --- kedro/io/core.py | 9 +++++++++ kedro/io/kedro_data_catalog.py | 24 ++++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/kedro/io/core.py b/kedro/io/core.py index 01e85963b9..a73dfd01a2 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -94,6 +94,15 @@ class VersionNotFoundError(DatasetError): pass +class VersionAlreadyExistsError(DatasetError): + """``VersioIsAmbiguousError`` raised by ``DataCatalog`` and ``KedroDataCatalog`` + classes in case of trying to add a datasets to the catalog with a save version + different from the one set for catalog. 
+ """ + + pass + + _DI = TypeVar("_DI") _DO = TypeVar("_DO") diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 8bbf573d7e..7fd17339bc 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -25,6 +25,7 @@ DatasetError, DatasetNotFoundError, Version, + VersionAlreadyExistsError, generate_timestamp, ) from kedro.io.memory_dataset import MemoryDataset @@ -660,3 +661,26 @@ def exists(self, name: str) -> bool: except DatasetNotFoundError: return False return dataset.exists() + + @staticmethod + def _validate_versions( + datasets: dict[str, AbstractDataset] | None = None, + load_versions: dict[str, str] | None = None, + save_version: str | None = None, + ) -> tuple[dict[str, str] | None, str | None]: + cur_save_version = save_version + cur_load_versions = load_versions or {} + for ds_name, ds in datasets.items(): + if isinstance(ds, AbstractVersionedDataset) and ds._version: + if ds._version.load: + cur_load_versions[ds_name] = ds._version.load + if ds._version.save: + cur_save_version = cur_save_version or ds._version.save + if cur_save_version != ds._version.save: + raise VersionAlreadyExistsError( + f"Cannot add a dataset `{ds_name}` with `{ds._version.save}` save version. " + f"Save version set for the catalog is `{cur_save_version}`" + f"All datasets in the catalog must have the same Save version." 
+ ) + + return cur_save_version, cur_load_versions From 5aaebe63942e80d704baf797c04fa9f5785baae6 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 21 Nov 2024 17:23:32 +0000 Subject: [PATCH 40/78] Added _validate_versions calls Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 7fd17339bc..c7f012414b 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -96,6 +96,10 @@ def __init__( >>> save_args={"index": False}) >>> catalog = KedroDataCatalog(datasets={"cars": cars}) """ + load_versions, save_version = self._validate_versions( + datasets, load_versions, save_version + ) + self._config_resolver = config_resolver or CatalogConfigResolver() self._datasets = datasets or {} self._lazy_datasets: dict[str, _LazyDataset] = {} @@ -219,6 +223,9 @@ def __setitem__(self, key: str, value: Any) -> None: if key in self._datasets: self._logger.warning("Replacing dataset '%s'", key) if isinstance(value, AbstractDataset): + self._load_versions, self._save_version = self._validate_versions( + {key: value}, self._load_versions, self._save_version + ) self._datasets[key] = value elif isinstance(value, _LazyDataset): self._lazy_datasets[key] = value From bdb7cf6f40984d658d565746b397f4f5e68dc59d Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 21 Nov 2024 17:25:44 +0000 Subject: [PATCH 41/78] Updated error descriptions Signed-off-by: Elena Khaustova --- kedro/io/core.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index a73dfd01a2..4d1a94d887 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -71,16 +71,16 @@ class DatasetError(Exception): class DatasetNotFoundError(DatasetError): - """``DatasetNotFoundError`` raised by ``DataCatalog`` class in case of - trying to use a non-existing dataset. 
+ """``DatasetNotFoundError`` raised by ```DataCatalog`` and ``KedroDataCatalog`` + classes in case of trying to use a non-existing dataset. """ pass class DatasetAlreadyExistsError(DatasetError): - """``DatasetAlreadyExistsError`` raised by ``DataCatalog`` class in case - of trying to add a dataset which already exists in the ``DataCatalog``. + """``DatasetAlreadyExistsError`` raised by ```DataCatalog`` and ``KedroDataCatalog`` + classes in case of trying to add a dataset which already exists in the ``DataCatalog``. """ pass From e2ffeaa6d8a74fa7beabce73994a706626562694 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 22 Nov 2024 10:23:11 +0000 Subject: [PATCH 42/78] Added validation to the old catalog Signed-off-by: Elena Khaustova --- kedro/io/core.py | 26 ++++++++++++++++++++++++++ kedro/io/data_catalog.py | 9 ++++++++- kedro/io/kedro_data_catalog.py | 29 +++-------------------------- 3 files changed, 37 insertions(+), 27 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index 4d1a94d887..c81547b3d7 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -964,3 +964,29 @@ def confirm(self, name: str) -> None: def shallow_copy(self, extra_dataset_patterns: Patterns | None = None) -> _C: """Returns a shallow copy of the current object.""" ... 
+ + +def _validate_versions( + datasets: dict[str, AbstractDataset] | None, + load_versions: dict[str, str] | None, + save_version: str | None, +) -> tuple[dict[str, str] | None, str | None]: + if not datasets: + return load_versions, save_version + + cur_save_version = save_version + cur_load_versions = load_versions or {} + for ds_name, ds in datasets.items(): + if isinstance(ds, AbstractVersionedDataset) and ds._version: + if ds._version.load: + cur_load_versions[ds_name] = ds._version.load + if ds._version.save: + cur_save_version = cur_save_version or ds._version.save + if cur_save_version != ds._version.save: + raise VersionAlreadyExistsError( + f"Cannot add a dataset `{ds_name}` with `{ds._version.save}` save version. " + f"Save version set for the catalog is `{cur_save_version}`" + f"All datasets in the catalog must have the same Save version." + ) + + return cur_load_versions, cur_save_version diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index 6f9a678272..f5ac8bfcf6 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -25,6 +25,7 @@ DatasetError, DatasetNotFoundError, Version, + _validate_versions, generate_timestamp, ) from kedro.io.memory_dataset import MemoryDataset @@ -159,8 +160,11 @@ def __init__( # noqa: PLR0913 >>> save_args={"index": False}) >>> catalog = DataCatalog(datasets={'cars': cars}) """ - self._config_resolver = config_resolver or CatalogConfigResolver() + load_versions, save_version = _validate_versions( + datasets, load_versions, save_version + ) + self._config_resolver = config_resolver or CatalogConfigResolver() # Kept to avoid breaking changes if not config_resolver: self._config_resolver._dataset_patterns = dataset_patterns or {} @@ -506,6 +510,9 @@ def add( raise DatasetAlreadyExistsError( f"Dataset '{dataset_name}' has already been registered" ) + self._load_versions, self._save_version = _validate_versions( + {dataset_name: dataset}, self._load_versions, self._save_version + ) 
self._datasets[dataset_name] = dataset self.datasets = _FrozenDatasets(self.datasets, {dataset_name: dataset}) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index c7f012414b..956e8fef33 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -25,7 +25,7 @@ DatasetError, DatasetNotFoundError, Version, - VersionAlreadyExistsError, + _validate_versions, generate_timestamp, ) from kedro.io.memory_dataset import MemoryDataset @@ -96,7 +96,7 @@ def __init__( >>> save_args={"index": False}) >>> catalog = KedroDataCatalog(datasets={"cars": cars}) """ - load_versions, save_version = self._validate_versions( + load_versions, save_version = _validate_versions( datasets, load_versions, save_version ) @@ -223,7 +223,7 @@ def __setitem__(self, key: str, value: Any) -> None: if key in self._datasets: self._logger.warning("Replacing dataset '%s'", key) if isinstance(value, AbstractDataset): - self._load_versions, self._save_version = self._validate_versions( + self._load_versions, self._save_version = _validate_versions( {key: value}, self._load_versions, self._save_version ) self._datasets[key] = value @@ -668,26 +668,3 @@ def exists(self, name: str) -> bool: except DatasetNotFoundError: return False return dataset.exists() - - @staticmethod - def _validate_versions( - datasets: dict[str, AbstractDataset] | None = None, - load_versions: dict[str, str] | None = None, - save_version: str | None = None, - ) -> tuple[dict[str, str] | None, str | None]: - cur_save_version = save_version - cur_load_versions = load_versions or {} - for ds_name, ds in datasets.items(): - if isinstance(ds, AbstractVersionedDataset) and ds._version: - if ds._version.load: - cur_load_versions[ds_name] = ds._version.load - if ds._version.save: - cur_save_version = cur_save_version or ds._version.save - if cur_save_version != ds._version.save: - raise VersionAlreadyExistsError( - f"Cannot add a dataset `{ds_name}` with `{ds._version.save}` save 
version. " - f"Save version set for the catalog is `{cur_save_version}`" - f"All datasets in the catalog must have the same Save version." - ) - - return cur_save_version, cur_load_versions From 6b1e802c31af22ec976d5a1c01859405c967409b Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 22 Nov 2024 10:34:14 +0000 Subject: [PATCH 43/78] Fixed linter Signed-off-by: Elena Khaustova --- kedro/io/core.py | 6 +++--- kedro/io/data_catalog.py | 11 ++++------- kedro/io/kedro_data_catalog.py | 9 +++------ 3 files changed, 10 insertions(+), 16 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index c81547b3d7..d9ff64239c 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -968,14 +968,14 @@ def shallow_copy(self, extra_dataset_patterns: Patterns | None = None) -> _C: def _validate_versions( datasets: dict[str, AbstractDataset] | None, - load_versions: dict[str, str] | None, + load_versions: dict[str, str], save_version: str | None, -) -> tuple[dict[str, str] | None, str | None]: +) -> tuple[dict[str, str], str | None]: if not datasets: return load_versions, save_version cur_save_version = save_version - cur_load_versions = load_versions or {} + cur_load_versions = load_versions for ds_name, ds in datasets.items(): if isinstance(ds, AbstractVersionedDataset) and ds._version: if ds._version.load: diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index f5ac8bfcf6..42863c735f 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -160,24 +160,21 @@ def __init__( # noqa: PLR0913 >>> save_args={"index": False}) >>> catalog = DataCatalog(datasets={'cars': cars}) """ - load_versions, save_version = _validate_versions( - datasets, load_versions, save_version - ) - self._config_resolver = config_resolver or CatalogConfigResolver() # Kept to avoid breaking changes if not config_resolver: self._config_resolver._dataset_patterns = dataset_patterns or {} self._config_resolver._default_pattern = default_pattern or {} + 
self._load_versions, self._save_version = _validate_versions( + datasets, load_versions or {}, save_version + ) + self._datasets: dict[str, AbstractDataset] = {} self.datasets: _FrozenDatasets | None = None self.add_all(datasets or {}) - self._load_versions = load_versions or {} - self._save_version = save_version - self._use_rich_markup = _has_rich_handler() if feed_dict: diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 956e8fef33..9555cf1f69 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -96,15 +96,12 @@ def __init__( >>> save_args={"index": False}) >>> catalog = KedroDataCatalog(datasets={"cars": cars}) """ - load_versions, save_version = _validate_versions( - datasets, load_versions, save_version - ) - self._config_resolver = config_resolver or CatalogConfigResolver() self._datasets = datasets or {} self._lazy_datasets: dict[str, _LazyDataset] = {} - self._load_versions = load_versions or {} - self._save_version = save_version + self._load_versions, self._save_version = _validate_versions( + datasets, load_versions or {}, save_version + ) self._use_rich_markup = _has_rich_handler() From 06e343b34ffa17a9f6f13995903e11c188d98d06 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 22 Nov 2024 11:34:15 +0000 Subject: [PATCH 44/78] Implemented unit tests for KedroDataCatalog Signed-off-by: Elena Khaustova --- tests/io/conftest.py | 11 +++++ tests/io/test_kedro_data_catalog.py | 64 +++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) diff --git a/tests/io/conftest.py b/tests/io/conftest.py index 9abce4c83e..b01f11e2d0 100644 --- a/tests/io/conftest.py +++ b/tests/io/conftest.py @@ -3,6 +3,8 @@ import pytest from kedro_datasets.pandas import CSVDataset +from kedro.io import Version + @pytest.fixture def dummy_numpy_array(): @@ -34,6 +36,15 @@ def dataset(filepath): return CSVDataset(filepath=filepath, save_args={"index": False}) +@pytest.fixture +def dataset_versioned(filepath): + 
return CSVDataset( + filepath=filepath, + save_args={"index": False}, + version=Version(load="test_load_version.csv", save="test_save_version.csv"), + ) + + @pytest.fixture def correct_config(filepath): return { diff --git a/tests/io/test_kedro_data_catalog.py b/tests/io/test_kedro_data_catalog.py index 367580ef80..ef7d6b6a58 100644 --- a/tests/io/test_kedro_data_catalog.py +++ b/tests/io/test_kedro_data_catalog.py @@ -20,6 +20,8 @@ from kedro.io.core import ( _DEFAULT_PACKAGES, VERSION_FORMAT, + Version, + VersionAlreadyExistsError, generate_timestamp, parse_dataset_definition, ) @@ -667,3 +669,65 @@ def test_load_version_on_unversioned_dataset( with pytest.raises(DatasetError): catalog.load("boats", version="first") + + def test_redefine_save_version_via_catalog( + self, correct_config, dataset_versioned + ): + """Test redefining save version when it is already set""" + # Version is set automatically for the catalog + catalog = KedroDataCatalog.from_config(**correct_config) + with pytest.raises(VersionAlreadyExistsError): + catalog["ds_versioned"] = dataset_versioned + + # Version is set manually for the catalog + correct_config["catalog"]["boats"]["versioned"] = True + catalog = KedroDataCatalog.from_config(**correct_config) + with pytest.raises(VersionAlreadyExistsError): + catalog["ds_versioned"] = dataset_versioned + + def test_set_load_and_save_versions(self, correct_config, dataset_versioned): + """Test setting load and save versions for catalog based on dataset's versions provided""" + catalog = KedroDataCatalog(datasets={"ds_versioned": dataset_versioned}) + + assert ( + catalog._load_versions["ds_versioned"] + == dataset_versioned._version.load + ) + assert catalog._save_version == dataset_versioned._version.save + + def test_set_same_versions(self, correct_config, dataset_versioned): + """Test setting the same load and save versions for catalog based on dataset's versions provided""" + catalog = KedroDataCatalog(datasets={"ds_versioned": 
dataset_versioned}) + catalog["ds_same_versions"] = dataset_versioned + + assert ( + catalog._load_versions["ds_versioned"] + == dataset_versioned._version.load + ) + assert catalog._save_version == dataset_versioned._version.save + + def test_redefine_load_version(self, correct_config, dataset_versioned): + """Test redefining save version when it is already set""" + catalog = KedroDataCatalog(datasets={"ds_versioned": dataset_versioned}) + dataset_versioned._version = Version( + load="another_load_version.csv", # load exact version + save="test_save_version.csv", # save to exact version + ) + catalog["ds_same_versions"] = dataset_versioned + + assert ( + catalog._load_versions["ds_same_versions"] + == dataset_versioned._version.load + ) + assert catalog._load_versions["ds_versioned"] == "test_load_version.csv" + assert catalog._save_version == dataset_versioned._version.save + + def test_redefine_save_version(self, correct_config, dataset_versioned): + """Test redefining save version when it is already set""" + catalog = KedroDataCatalog(datasets={"ds_versioned": dataset_versioned}) + dataset_versioned._version = Version( + load="another_load_version.csv", # load exact version + save="another_save_version.csv", # save to exact version + ) + with pytest.raises(VersionAlreadyExistsError): + catalog["ds_same_versions"] = dataset_versioned From 5492b9f5d8e9a0f66dedb33ece3829708d823d4a Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 22 Nov 2024 11:35:24 +0000 Subject: [PATCH 45/78] Removed odd comments Signed-off-by: Elena Khaustova --- tests/io/test_kedro_data_catalog.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/io/test_kedro_data_catalog.py b/tests/io/test_kedro_data_catalog.py index ef7d6b6a58..044700362b 100644 --- a/tests/io/test_kedro_data_catalog.py +++ b/tests/io/test_kedro_data_catalog.py @@ -710,8 +710,8 @@ def test_redefine_load_version(self, correct_config, dataset_versioned): """Test redefining save version 
when it is already set""" catalog = KedroDataCatalog(datasets={"ds_versioned": dataset_versioned}) dataset_versioned._version = Version( - load="another_load_version.csv", # load exact version - save="test_save_version.csv", # save to exact version + load="another_load_version.csv", + save="test_save_version.csv", ) catalog["ds_same_versions"] = dataset_versioned @@ -726,8 +726,8 @@ def test_redefine_save_version(self, correct_config, dataset_versioned): """Test redefining save version when it is already set""" catalog = KedroDataCatalog(datasets={"ds_versioned": dataset_versioned}) dataset_versioned._version = Version( - load="another_load_version.csv", # load exact version - save="another_save_version.csv", # save to exact version + load="another_load_version.csv", + save="another_save_version.csv", ) with pytest.raises(VersionAlreadyExistsError): catalog["ds_same_versions"] = dataset_versioned From c96546cb14357c15cae637bad730509e2303cb80 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 22 Nov 2024 11:52:00 +0000 Subject: [PATCH 46/78] Implemented tests for DataCatalog Signed-off-by: Elena Khaustova --- tests/io/test_data_catalog.py | 55 +++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/tests/io/test_data_catalog.py b/tests/io/test_data_catalog.py index 386c0812db..ebe5ce6164 100644 --- a/tests/io/test_data_catalog.py +++ b/tests/io/test_data_catalog.py @@ -23,6 +23,7 @@ _DEFAULT_PACKAGES, VERSION_FORMAT, Version, + VersionAlreadyExistsError, generate_timestamp, parse_dataset_definition, ) @@ -753,6 +754,60 @@ def test_no_versions_with_cloud_protocol(self, monkeypatch): with pytest.raises(DatasetError, match=pattern): versioned_dataset.load() + def test_redefine_save_version_via_catalog(self, correct_config, dataset_versioned): + """Test redefining save version when it is already set""" + # Version is set automatically for the catalog + catalog = DataCatalog.from_config(**correct_config) + with 
pytest.raises(VersionAlreadyExistsError): + catalog.add("ds_versioned", dataset_versioned) + + # Version is set manually for the catalog + correct_config["catalog"]["boats"]["versioned"] = True + catalog = DataCatalog.from_config(**correct_config) + with pytest.raises(VersionAlreadyExistsError): + catalog.add("ds_versioned", dataset_versioned) + + def test_set_load_and_save_versions(self, correct_config, dataset_versioned): + """Test setting load and save versions for catalog based on dataset's versions provided""" + catalog = DataCatalog(datasets={"ds_versioned": dataset_versioned}) + + assert catalog._load_versions["ds_versioned"] == dataset_versioned._version.load + assert catalog._save_version == dataset_versioned._version.save + + def test_set_same_versions(self, correct_config, dataset_versioned): + """Test setting the same load and save versions for catalog based on dataset's versions provided""" + catalog = DataCatalog(datasets={"ds_versioned": dataset_versioned}) + catalog.add("ds_same_versions", dataset_versioned) + + assert catalog._load_versions["ds_versioned"] == dataset_versioned._version.load + assert catalog._save_version == dataset_versioned._version.save + + def test_redefine_load_version(self, correct_config, dataset_versioned): + """Test redefining save version when it is already set""" + catalog = DataCatalog(datasets={"ds_versioned": dataset_versioned}) + dataset_versioned._version = Version( + load="another_load_version.csv", + save="test_save_version.csv", + ) + catalog.add("ds_same_versions", dataset_versioned) + + assert ( + catalog._load_versions["ds_same_versions"] + == dataset_versioned._version.load + ) + assert catalog._load_versions["ds_versioned"] == "test_load_version.csv" + assert catalog._save_version == dataset_versioned._version.save + + def test_redefine_save_version(self, correct_config, dataset_versioned): + """Test redefining save version when it is already set""" + catalog = DataCatalog(datasets={"ds_versioned": 
dataset_versioned}) + dataset_versioned._version = Version( + load="another_load_version.csv", + save="another_save_version.csv", + ) + with pytest.raises(VersionAlreadyExistsError): + catalog.add("ds_same_versions", dataset_versioned) + class TestDataCatalogDatasetFactories: def test_match_added_to_datasets_on_get(self, config_with_dataset_factories): From 46f2df621f252e3eab540ff121ebf7742fe16705 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 22 Nov 2024 12:02:52 +0000 Subject: [PATCH 47/78] Added docstrings Signed-off-by: Elena Khaustova --- kedro/io/core.py | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index d9ff64239c..7cb1ff5dbd 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -95,9 +95,9 @@ class VersionNotFoundError(DatasetError): class VersionAlreadyExistsError(DatasetError): - """``VersioIsAmbiguousError`` raised by ``DataCatalog`` and ``KedroDataCatalog`` - classes in case of trying to add a datasets to the catalog with a save version - different from the one set for catalog. + """``VersionAlreadyExistsError`` raised by ``DataCatalog`` and ``KedroDataCatalog`` + classes when attempting to add a dataset to a catalog with a save version + that conflicts with the save version already set for the catalog. """ pass @@ -971,6 +971,28 @@ def _validate_versions( load_versions: dict[str, str], save_version: str | None, ) -> tuple[dict[str, str], str | None]: + """Validates and synchronizes dataset versions for loading and saving. + + Insures consistency of dataset versions across a catalog, particularly + for versioned datasets. It updates load versions and validates that all + save versions are consistent. + + Args: + datasets: A dictionary mapping dataset names to their instances. + if None, no validation occurs. + load_versions: A mapping between dataset names and versions + to load. 
+ save_version: Version string to be used for ``save`` operations + by all datasets with enabled versioning. + + Returns: + Updated ``load_versions`` with load versions specified in the ``datasets`` + and resolved ``save_version``. + + Raises: + VersionAlreadyExistsError: If a dataset's save version conflicts with + the catalog's save version. + """ if not datasets: return load_versions, save_version From 56a067c88819ed279905b1f3e27ee3330a924d38 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 22 Nov 2024 12:05:25 +0000 Subject: [PATCH 48/78] Added release notes Signed-off-by: Elena Khaustova --- RELEASE.md | 1 + 1 file changed, 1 insertion(+) diff --git a/RELEASE.md b/RELEASE.md index 26be20d106..58857a77d7 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -11,6 +11,7 @@ ## Bug fixes and other changes * Added I/O support for Oracle Cloud Infrastructure (OCI) Object Storage filesystem. * Fixed `DatasetAlreadyExistsError` for `ThreadRunner` when Kedro project run and using runner separately. +* Added validation to ensure dataset versions consistency across catalog. 
## Breaking changes to the API ## Documentation changes From e9027b9c8876a0f84bf2a451b1b851a8335baf67 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 22 Nov 2024 15:51:13 +0000 Subject: [PATCH 49/78] Updated version logic Signed-off-by: Elena Khaustova --- kedro/io/core.py | 19 +++++-------------- kedro/io/kedro_data_catalog.py | 21 ++++++--------------- 2 files changed, 11 insertions(+), 29 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index 7db2ccda1e..fa20067f59 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -217,12 +217,10 @@ def from_config( ) from err return dataset - def to_config(self) -> tuple[dict[str, Any], dict[str, str] | None, str | None]: + def to_config(self) -> dict[str, Any]: return_config: dict[str, Any] = { f"{TYPE_KEY}": f"{type(self).__module__}.{type(self).__name__}" } - load_versions: dict[str, str] | None = None - save_version: str | None = None if self._init_args: return_config.update(self._init_args) @@ -233,23 +231,16 @@ def to_config(self) -> tuple[dict[str, Any], dict[str, str] | None, str | None]: if isinstance(cached_ds, dict): cached_ds_return_config = cached_ds elif isinstance(cached_ds, AbstractDataset): - cached_ds_return_config, load_versions, save_version = ( - cached_ds.to_config() - ) + cached_ds_return_config = cached_ds.to_config() if VERSIONED_FLAG_KEY in cached_ds_return_config: return_config[VERSIONED_FLAG_KEY] = cached_ds_return_config.pop( VERSIONED_FLAG_KEY ) return_config["dataset"] = cached_ds_return_config - version = return_config.pop(VERSION_KEY, None) - - if version: + # Set `versioned` key if version present in the dataset + if return_config.pop(VERSION_KEY, None): return_config[VERSIONED_FLAG_KEY] = True - load_versions, save_version = ( - load_versions or version.load, - save_version or version.save, - ) # Pop data from configuration if type(self).__name__ == "MemoryDataset": @@ -258,7 +249,7 @@ def to_config(self) -> tuple[dict[str, Any], dict[str, str] | None, str | None]: 
# Pop metadata from configuration return_config.pop("metadata", None) - return return_config, load_versions, save_version + return return_config @property def _logger(self) -> logging.Logger: diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index e32aa032d4..a259d5bcef 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -18,7 +18,6 @@ from kedro.io.catalog_config_resolver import CatalogConfigResolver, Patterns from kedro.io.core import ( - VERSIONED_FLAG_KEY, AbstractDataset, AbstractVersionedDataset, CatalogProtocol, @@ -377,12 +376,11 @@ def to_config( dict[str, dict[str, Any]], dict[str, dict[str, Any]], dict[str, str | None], - dict[str, str | None], + str | None, ]: catalog: dict[str, dict[str, Any]] = {} credentials: dict[str, dict[str, Any]] = {} - load_version: dict[str, str | None] = {} - save_version: dict[str, str | None] = {} + load_versions: dict[str, str | None] = {} for ds_name, ds in self._lazy_datasets.items(): if _is_parameter(ds_name): @@ -392,27 +390,20 @@ def to_config( ) catalog[ds_name] = unresolved_config credentials.update(unresolved_credentials) - # TODO: Update when #4327 resolved - if catalog[ds_name].get(VERSIONED_FLAG_KEY, None): - load_version[ds_name] = ds.load_version - save_version[ds_name] = ds.save_version - else: - load_version[ds_name] = None - save_version[ds_name] = None + load_versions[ds_name] = self._load_versions.get(ds_name, None) for ds_name, ds in self._datasets.items(): # type: ignore[assignment] if _is_parameter(ds_name): continue - resolved_config, cur_load_versions, cur_save_version = ds.to_config() # type: ignore[attr-defined] + resolved_config = ds.to_config() # type: ignore[attr-defined] unresolved_config, unresolved_credentials = ( self._config_resolver.unresolve_credentials(ds_name, resolved_config) ) catalog[ds_name] = unresolved_config credentials.update(unresolved_credentials) - load_version[ds_name] = cur_load_versions - save_version[ds_name] = 
cur_save_version + load_versions[ds_name] = self._load_versions.get(ds_name, None) - return catalog, credentials, load_version, save_version + return catalog, credentials, load_versions, self._save_version @staticmethod def _validate_dataset_config(ds_name: str, ds_config: Any) -> None: From 11b148be0092f12d7d19a23ac45712ce49051e60 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 22 Nov 2024 18:06:51 +0000 Subject: [PATCH 50/78] Added CachedDataset case Signed-off-by: Elena Khaustova --- kedro/io/core.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index 7cb1ff5dbd..2b0c09a0e1 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -996,17 +996,22 @@ def _validate_versions( if not datasets: return load_versions, save_version + cur_load_versions = load_versions.copy() cur_save_version = save_version - cur_load_versions = load_versions + for ds_name, ds in datasets.items(): - if isinstance(ds, AbstractVersionedDataset) and ds._version: - if ds._version.load: - cur_load_versions[ds_name] = ds._version.load - if ds._version.save: - cur_save_version = cur_save_version or ds._version.save - if cur_save_version != ds._version.save: + # TODO: Move to kedro/io/kedro_data_catalog.py when removing DataCatalog + # TODO: Replace with isinstance(ds, CachedDataset) - current implementation to avoid circular import + cur_ds = ds._dataset if ds.__class__.__name__ == "CachedDataset" else ds # type: ignore[attr-defined] + + if isinstance(cur_ds, AbstractVersionedDataset) and cur_ds._version: + if cur_ds._version.load: + cur_load_versions[ds_name] = cur_ds._version.load + if cur_ds._version.save: + cur_save_version = cur_save_version or cur_ds._version.save + if cur_save_version != cur_ds._version.save: raise VersionAlreadyExistsError( - f"Cannot add a dataset `{ds_name}` with `{ds._version.save}` save version. " + f"Cannot add a dataset `{ds_name}` with `{cur_ds._version.save}` save version. 
" f"Save version set for the catalog is `{cur_save_version}`" f"All datasets in the catalog must have the same Save version." ) From ca2ac6c873368351d8ae4a8d45c5bf09454cd060 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 27 Nov 2024 10:13:10 +0000 Subject: [PATCH 51/78] Updated release notes Signed-off-by: Elena Khaustova --- RELEASE.md | 2 +- kedro/io/core.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/RELEASE.md b/RELEASE.md index a4304d2cc3..94fa345843 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -2,6 +2,7 @@ ## Major features and improvements ## Bug fixes and other changes +* Added validation to ensure dataset versions consistency across catalog. ## Breaking changes to the API ## Documentation changes ## Community contributions @@ -19,7 +20,6 @@ ## Bug fixes and other changes * Added I/O support for Oracle Cloud Infrastructure (OCI) Object Storage filesystem. * Fixed `DatasetAlreadyExistsError` for `ThreadRunner` when Kedro project run and using runner separately. -* Added validation to ensure dataset versions consistency across catalog. 
## Breaking changes to the API ## Documentation changes diff --git a/kedro/io/core.py b/kedro/io/core.py index 2b0c09a0e1..28ed6b164e 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -1001,7 +1001,7 @@ def _validate_versions( for ds_name, ds in datasets.items(): # TODO: Move to kedro/io/kedro_data_catalog.py when removing DataCatalog - # TODO: Replace with isinstance(ds, CachedDataset) - current implementation to avoid circular import + # TODO: Replace with isinstance(ds, CachedDataset) - current implementation avoids circular import cur_ds = ds._dataset if ds.__class__.__name__ == "CachedDataset" else ds # type: ignore[attr-defined] if isinstance(cur_ds, AbstractVersionedDataset) and cur_ds._version: From 615d1356a48c0bb2220b95e43c5672e3f6a5eade Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 27 Nov 2024 10:57:49 +0000 Subject: [PATCH 52/78] Added tests for CachedDataset use case Signed-off-by: Elena Khaustova --- tests/io/conftest.py | 13 ++++++++++++- tests/io/test_data_catalog.py | 22 ++++++++++++++++++++++ tests/io/test_kedro_data_catalog.py | 22 ++++++++++++++++++++++ 3 files changed, 56 insertions(+), 1 deletion(-) diff --git a/tests/io/conftest.py b/tests/io/conftest.py index b01f11e2d0..ce466469dd 100644 --- a/tests/io/conftest.py +++ b/tests/io/conftest.py @@ -3,7 +3,7 @@ import pytest from kedro_datasets.pandas import CSVDataset -from kedro.io import Version +from kedro.io import CachedDataset, Version @pytest.fixture @@ -45,6 +45,17 @@ def dataset_versioned(filepath): ) +@pytest.fixture +def cached_dataset_versioned(filepath): + return CachedDataset( + dataset=CSVDataset( + filepath=filepath, + save_args={"index": False}, + version=Version(load="test_load_version.csv", save="test_save_version.csv"), + ) + ) + + @pytest.fixture def correct_config(filepath): return { diff --git a/tests/io/test_data_catalog.py b/tests/io/test_data_catalog.py index ebe5ce6164..180919e309 100644 --- a/tests/io/test_data_catalog.py +++ 
b/tests/io/test_data_catalog.py @@ -808,6 +808,28 @@ def test_redefine_save_version(self, correct_config, dataset_versioned): with pytest.raises(VersionAlreadyExistsError): catalog.add("ds_same_versions", dataset_versioned) + def test_redefine_save_version_with_cached_dataset( + self, correct_config, cached_dataset_versioned + ): + """Test redefining load and save version with CachedDataset""" + catalog = DataCatalog.from_config(**correct_config) + + # Redefining save version fails + with pytest.raises(VersionAlreadyExistsError): + catalog.add("cached_dataset_versioned", cached_dataset_versioned) + + # Redefining load version passes + cached_dataset_versioned._dataset._version = Version( + load="test_load_version.csv", save=None + ) + catalog.add("cached_dataset_versioned", cached_dataset_versioned) + + assert ( + catalog._load_versions["cached_dataset_versioned"] + == "test_load_version.csv" + ) + assert catalog._save_version + class TestDataCatalogDatasetFactories: def test_match_added_to_datasets_on_get(self, config_with_dataset_factories): diff --git a/tests/io/test_kedro_data_catalog.py b/tests/io/test_kedro_data_catalog.py index 044700362b..efd5a8a68e 100644 --- a/tests/io/test_kedro_data_catalog.py +++ b/tests/io/test_kedro_data_catalog.py @@ -731,3 +731,25 @@ def test_redefine_save_version(self, correct_config, dataset_versioned): ) with pytest.raises(VersionAlreadyExistsError): catalog["ds_same_versions"] = dataset_versioned + + def test_redefine_save_version_with_cached_dataset( + self, correct_config, cached_dataset_versioned + ): + """Test redefining load and save version with CachedDataset""" + catalog = KedroDataCatalog.from_config(**correct_config) + + # Redefining save version fails + with pytest.raises(VersionAlreadyExistsError): + catalog["cached_dataset_versioned"] = cached_dataset_versioned + + # Redefining load version passes + cached_dataset_versioned._dataset._version = Version( + load="test_load_version.csv", save=None + ) + 
catalog["cached_dataset_versioned"] = cached_dataset_versioned + + assert ( + catalog._load_versions["cached_dataset_versioned"] + == "test_load_version.csv" + ) + assert catalog._save_version From 8a01881734e33bd88b1425b9d7be8e983d3d309f Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 27 Nov 2024 11:40:59 +0000 Subject: [PATCH 53/78] Updated unit test after version validation is applied Signed-off-by: Elena Khaustova --- tests/io/test_kedro_data_catalog.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/io/test_kedro_data_catalog.py b/tests/io/test_kedro_data_catalog.py index 1bc4854a62..1d25b39a92 100644 --- a/tests/io/test_kedro_data_catalog.py +++ b/tests/io/test_kedro_data_catalog.py @@ -309,7 +309,7 @@ def test_to_config(self, correct_config_versioned, dataset, filepath): version = Version( load="fake_load_version.csv", # load exact version - save="fake_save_version.csv", # save to exact version + save=None, # save to exact version ) versioned_dataset = CSVDataset( filepath="shuttles.csv", version=version, metadata=[1, 2, 3] @@ -351,10 +351,14 @@ def test_to_config(self, correct_config_versioned, dataset, filepath): expected_config.update(config) expected_config.pop("parameters", None) - # TODO: Add expected load/save versions when #4327 resolved - assert catalog_config == expected_config assert catalog_credentials == credentials + # Load version is set only for cached_versioned_dataset + assert catalog._load_versions == { + "cached_versioned_dataset": "fake_load_version.csv" + } + # Save version is not None and set to default + assert catalog._save_version class TestKedroDataCatalogFromConfig: def test_from_correct_config(self, data_catalog_from_config, dummy_dataframe): From eb44a30fab93baf6739b722e9c0450218e9b3b5d Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 27 Nov 2024 12:21:51 +0000 Subject: [PATCH 54/78] Removed MemoryDatasets Signed-off-by: Elena Khaustova --- kedro/io/core.py | 14 
++++++++++---- kedro/io/kedro_data_catalog.py | 6 +++--- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index 81e3ee2ff8..4bca41e0e2 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -242,10 +242,6 @@ def to_config(self) -> dict[str, Any]: if return_config.pop(VERSION_KEY, None): return_config[VERSIONED_FLAG_KEY] = True - # Pop data from configuration - if type(self).__name__ == "MemoryDataset": - return_config.pop("data", None) - # Pop metadata from configuration return_config.pop("metadata", None) @@ -1078,3 +1074,13 @@ def _validate_versions( ) return cur_load_versions, cur_save_version + + +def _is_memory_dataset(ds_or_type: str | AbstractDataset) -> bool: + """Check if dataset or str type provided is a MemoryDataset.""" + if isinstance(ds_or_type, AbstractDataset): + return ds_or_type.__class__.__name__ == "MemoryDataset" + if isinstance(ds_or_type, str): + return ds_or_type == "MemoryDataset" + + return False diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index a259d5bcef..73da8d6b63 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -25,7 +25,7 @@ DatasetError, DatasetNotFoundError, Version, - _is_parameter, + _is_memory_dataset, _validate_versions, generate_timestamp, ) @@ -383,7 +383,7 @@ def to_config( load_versions: dict[str, str | None] = {} for ds_name, ds in self._lazy_datasets.items(): - if _is_parameter(ds_name): + if _is_memory_dataset(ds.config.get("type", "")): continue unresolved_config, unresolved_credentials = ( self._config_resolver.unresolve_credentials(ds_name, ds.config) @@ -393,7 +393,7 @@ def to_config( load_versions[ds_name] = self._load_versions.get(ds_name, None) for ds_name, ds in self._datasets.items(): # type: ignore[assignment] - if _is_parameter(ds_name): + if _is_memory_dataset(ds): continue resolved_config = ds.to_config() # type: ignore[attr-defined] unresolved_config, unresolved_credentials = ( From 
ba3d04e390428a103b73c659bea79fdda586e292 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 27 Nov 2024 12:24:01 +0000 Subject: [PATCH 55/78] Removed _is_parameter Signed-off-by: Elena Khaustova --- kedro/framework/cli/catalog.py | 2 +- kedro/io/core.py | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/kedro/framework/cli/catalog.py b/kedro/framework/cli/catalog.py index af4fa7a6ea..99350dc01c 100644 --- a/kedro/framework/cli/catalog.py +++ b/kedro/framework/cli/catalog.py @@ -28,7 +28,7 @@ def _create_session(package_name: str, **kwargs: Any) -> KedroSession: def is_parameter(dataset_name: str) -> bool: - # TODO: when breaking change replace with is_parameter from kedro/io/core.py + # TODO: when breaking change move it to kedro/io/core.py """Check if dataset is a parameter.""" return dataset_name.startswith("params:") or dataset_name == "parameters" diff --git a/kedro/io/core.py b/kedro/io/core.py index 4bca41e0e2..518e60b7e1 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -1017,12 +1017,6 @@ def shallow_copy(self, extra_dataset_patterns: Patterns | None = None) -> _C: ... 
-def _is_parameter(dataset_name: str) -> bool: - # TODO: when breaking change replace with is_parameter and remove is_parameter from kedro/framework/cli/catalog.py - """Check if dataset is a parameter.""" - return dataset_name.startswith("params:") or dataset_name == "parameters" - - def _validate_versions( datasets: dict[str, AbstractDataset] | None, load_versions: dict[str, str], From 35953a9f6cbe5d3f8f7271f1fa3b7f614c70197a Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 27 Nov 2024 12:26:15 +0000 Subject: [PATCH 56/78] Pop metadata from cached dataset configuration Signed-off-by: Elena Khaustova --- kedro/io/core.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index 518e60b7e1..a3c6f80885 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -236,6 +236,8 @@ def to_config(self) -> dict[str, Any]: return_config[VERSIONED_FLAG_KEY] = cached_ds_return_config.pop( VERSIONED_FLAG_KEY ) + # Pop metadata from configuration + cached_ds_return_config.pop("metadata", None) return_config["dataset"] = cached_ds_return_config # Set `versioned` key if version present in the dataset @@ -1070,7 +1072,7 @@ def _validate_versions( return cur_load_versions, cur_save_version -def _is_memory_dataset(ds_or_type: str | AbstractDataset) -> bool: +def _is_memory_dataset(ds_or_type: AbstractDataset | str) -> bool: """Check if dataset or str type provided is a MemoryDataset.""" if isinstance(ds_or_type, AbstractDataset): return ds_or_type.__class__.__name__ == "MemoryDataset" From d56793b70b308ab7d4955bb7f91dd4eac701d1c3 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 27 Nov 2024 12:31:14 +0000 Subject: [PATCH 57/78] Fixed lint Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 73da8d6b63..42e72f21d4 100644 --- a/kedro/io/kedro_data_catalog.py +++ 
b/kedro/io/kedro_data_catalog.py @@ -393,7 +393,7 @@ def to_config( load_versions[ds_name] = self._load_versions.get(ds_name, None) for ds_name, ds in self._datasets.items(): # type: ignore[assignment] - if _is_memory_dataset(ds): + if _is_memory_dataset(ds): # type: ignore[arg-type] continue resolved_config = ds.to_config() # type: ignore[attr-defined] unresolved_config, unresolved_credentials = ( From ebf148333e2326339f08eac92431aa2c06277d41 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 27 Nov 2024 14:21:00 +0000 Subject: [PATCH 58/78] Fixed unit test Signed-off-by: Elena Khaustova --- kedro/io/core.py | 2 +- tests/io/test_core.py | 14 ++++++++++++++ tests/io/test_kedro_data_catalog.py | 4 ---- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index a3c6f80885..1c0bd9ce33 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -1077,6 +1077,6 @@ def _is_memory_dataset(ds_or_type: AbstractDataset | str) -> bool: if isinstance(ds_or_type, AbstractDataset): return ds_or_type.__class__.__name__ == "MemoryDataset" if isinstance(ds_or_type, str): - return ds_or_type == "MemoryDataset" + return ds_or_type in {"MemoryDataset", "kedro.io.memory_dataset.MemoryDataset"} return False diff --git a/tests/io/test_core.py b/tests/io/test_core.py index 7c30652b6b..388f3c6d3f 100644 --- a/tests/io/test_core.py +++ b/tests/io/test_core.py @@ -18,6 +18,7 @@ DatasetError, Version, VersionNotFoundError, + _is_memory_dataset, generate_timestamp, get_filepath_str, get_protocol_and_path, @@ -593,3 +594,16 @@ def test_versioning_existing_dataset( Path(my_legacy_dataset._filepath.as_posix()).unlink() my_legacy_versioned_dataset.save(dummy_data) assert my_legacy_versioned_dataset.exists() + + +@pytest.mark.parametrize( + "ds_or_type,expected_result", + [ + ("MemoryDataset", True), + ("kedro.io.memory_dataset.MemoryDataset", True), + ("NotMemoryDataset", False), + (my_dataset, False), + ], +) +def test_is_memory_dataset(ds_or_type, 
expected_result): + assert _is_memory_dataset(ds_or_type) == expected_result diff --git a/tests/io/test_kedro_data_catalog.py b/tests/io/test_kedro_data_catalog.py index 1d25b39a92..e6ffbf88aa 100644 --- a/tests/io/test_kedro_data_catalog.py +++ b/tests/io/test_kedro_data_catalog.py @@ -330,10 +330,6 @@ def test_to_config(self, correct_config_versioned, dataset, filepath): "credentials": None, "fs_args": None, }, - "memory_ds": { - "type": "kedro.io.memory_dataset.MemoryDataset", - "copy_mode": None, - }, "cached_versioned_dataset": { "type": "kedro.io.cached_dataset.CachedDataset", "copy_mode": None, From f5468c9f14ad81515bac1575f641bfe969f91539 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 27 Nov 2024 15:55:01 +0000 Subject: [PATCH 59/78] Added docstrings for AbstractDataset.to_config() Signed-off-by: Elena Khaustova --- kedro/io/core.py | 64 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index 1c0bd9ce33..6e002650ee 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -159,9 +159,21 @@ class AbstractDataset(abc.ABC, Generic[_DI, _DO]): need to change the `_EPHEMERAL` attribute to 'True'. """ _EPHEMERAL = False + # Declares a class-level attribute that will store the initialization + # arguments of an instance. Initially, it is set to None, but it will + # hold a dictionary of arguments after initialization. _init_args: dict[str, Any] | None = None def __post_init__(self, call_args: dict[str, Any]) -> None: + """Handles additional setup after the object is initialized. + + Stores the initialization arguments (excluding `self`) in the `_init_args` attribute. + + Args: + call_args: A dictionary of arguments passed to the `__init__` method, captured + using `inspect.getcallargs`. 
+ """ + self._init_args = call_args self._init_args.pop("self", None) @@ -218,6 +230,32 @@ def from_config( return dataset def to_config(self) -> dict[str, Any]: + """Converts the dataset instance into a dictionary-based configuration for + serialization or reconstruction. + + Ensures that any subclass-specific details are handled, with + additional logic for versioning and caching implemented for `CachedDataset`. + + Functionality: + 1. Base Configuration: + - Adds a key for the dataset's type using its module and class name. + - Includes the initialization arguments (`_init_args`) if available. + + 2. Special Handling for `CachedDataset`: + - Extracts the underlying dataset's configuration. + - Handles the `versioned` flag and removes unnecessary metadata. + - Ensures the embedded dataset's configuration is appropriately flattened + or transformed. + + 3. Versioning: + - If the dataset has a version key, sets the `versioned` flag in the configuration. + + 4. Metadata Removal: + - Removes the `metadata` key from the configuration if present. + + Returns: + A dictionary containing the dataset's type and initialization arguments. + """ return_config: dict[str, Any] = { f"{TYPE_KEY}": f"{type(self).__module__}.{type(self).__name__}" } @@ -329,23 +367,47 @@ def save(self: Self, data: _DI) -> None: return save def __init_subclass__(cls, **kwargs: Any) -> None: - """Decorate the `load` and `save` methods provided by the class. + """Customizes the behavior of subclasses of AbstractDataset during + their creation. This method is automatically invoked when a subclass + of AbstractDataset is defined. + + Decorate the `load` and `save` methods provided by the class. If `_load` or `_save` are defined, alias them as a prerequisite. """ + # Save the original __init__ method of the subclass init_func: Callable = cls.__init__ def init_decorator(previous_init: Callable) -> Callable: + """A decorator that wraps the original __init__ of the subclass. 
+ + It ensures that after the original __init__ executes, the `__post_init__` + method of the instance is called with the arguments used to initialize + the object. + """ + def new_init(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] + """The decorated __init__ method. + + Executes the original __init__, then calls __post_init__ with the arguments + used to initialize the instance. + """ + + # Call the original __init__ method previous_init(self, *args, **kwargs) if type(self) is cls: + # Capture and process the arguments passed to the original __init__ call_args = getcallargs(init_func, self, *args, **kwargs) + # Call the custom post-initialization method to save captured arguments self.__post_init__(call_args) return new_init + # Replace the subclass's __init__ with the decorated version + # A hook for subclasses to capture initialization arguments and save them + # in the AbstractDataset._init_args field cls.__init__ = init_decorator(cls.__init__) # type: ignore[method-assign] super().__init_subclass__(**kwargs) From edee597ef1e0292fe99af78ff0950844c29013a1 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 27 Nov 2024 17:21:06 +0000 Subject: [PATCH 60/78] Updated docstrings Signed-off-by: Elena Khaustova --- kedro/io/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index 6e002650ee..aeae140fb5 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -372,7 +372,7 @@ def __init_subclass__(cls, **kwargs: Any) -> None: of AbstractDataset is defined. - Decorate the `load` and `save` methods provided by the class. + Decorates the `load` and `save` methods provided by the class. If `_load` or `_save` are defined, alias them as a prerequisite. 
""" From 445497030b3cf7a871ff968967c9298a09523409 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 27 Nov 2024 17:25:26 +0000 Subject: [PATCH 61/78] Fixed typos Signed-off-by: Elena Khaustova --- kedro/io/core.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index 28ed6b164e..0591088f69 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -971,9 +971,9 @@ def _validate_versions( load_versions: dict[str, str], save_version: str | None, ) -> tuple[dict[str, str], str | None]: - """Validates and synchronizes dataset versions for loading and saving. + """Validates and synchronises dataset versions for loading and saving. - Insures consistency of dataset versions across a catalog, particularly + Ensures consistency of dataset versions across a catalog, particularly for versioned datasets. It updates load versions and validates that all save versions are consistent. @@ -983,7 +983,7 @@ def _validate_versions( load_versions: A mapping between dataset names and versions to load. save_version: Version string to be used for ``save`` operations - by all datasets with enabled versioning. + by all datasets with versioning enabled. Returns: Updated ``load_versions`` with load versions specified in the ``datasets`` @@ -1013,7 +1013,7 @@ def _validate_versions( raise VersionAlreadyExistsError( f"Cannot add a dataset `{ds_name}` with `{cur_ds._version.save}` save version. " f"Save version set for the catalog is `{cur_save_version}`" - f"All datasets in the catalog must have the same Save version." + f"All datasets in the catalog must have the same save version." 
) return cur_load_versions, cur_save_version From 5d6bd3c7590e706954db6a7fdfc11ef4e29ebe9e Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 27 Nov 2024 17:26:43 +0000 Subject: [PATCH 62/78] Updated TODOs Signed-off-by: Elena Khaustova --- kedro/io/core.py | 1 + 1 file changed, 1 insertion(+) diff --git a/kedro/io/core.py b/kedro/io/core.py index 0591088f69..c83e77c7a6 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -1001,6 +1001,7 @@ def _validate_versions( for ds_name, ds in datasets.items(): # TODO: Move to kedro/io/kedro_data_catalog.py when removing DataCatalog + # TODO: Make it a protected static method for KedroDataCatalog # TODO: Replace with isinstance(ds, CachedDataset) - current implementation avoids circular import cur_ds = ds._dataset if ds.__class__.__name__ == "CachedDataset" else ds # type: ignore[attr-defined] From 86e25e963fc7c51a1d9f4d84e648c9efdc0fd21c Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 27 Nov 2024 19:16:21 +0000 Subject: [PATCH 63/78] Added docstring for KedroDataCatalog.to_config Signed-off-by: Elena Khaustova --- kedro/io/core.py | 2 +- kedro/io/kedro_data_catalog.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index 1247b8fcf9..741c94df7c 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -231,7 +231,7 @@ def from_config( def to_config(self) -> dict[str, Any]: """Converts the dataset instance into a dictionary-based configuration for - serialization or reconstruction. + serialization. Ensures that any subclass-specific details are handled, with additional logic for versioning and caching implemented for `CachedDataset`. 
diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py
index 42e72f21d4..ef65834940 100644
--- a/kedro/io/kedro_data_catalog.py
+++ b/kedro/io/kedro_data_catalog.py
@@ -378,6 +378,34 @@ def to_config(
         dict[str, str | None],
         str | None,
     ]:
+        """Converts the KedroDataCatalog instance into a configuration format suitable for
+        serialization. This includes datasets, credentials, and versioning information.
+
+        Returns:
+            A tuple containing:
+            - catalog: A dictionary mapping dataset names to their unresolved configurations,
+                excluding in-memory datasets.
+            - credentials: A dictionary of unresolved credentials extracted from dataset configurations.
+            - load_versions: A dictionary mapping dataset names to specific versions to be loaded,
+                or `None` if no version is set.
+            - save_version: A global version identifier for saving datasets, or `None` if not specified.
+        Example:
+        ::
+
+            >>> from kedro.io import KedroDataCatalog
+            >>> from kedro_datasets.pandas import CSVDataset
+            >>>
+            >>> cars = CSVDataset(
+            >>>     filepath="cars.csv",
+            >>>     load_args=None,
+            >>>     save_args={"index": False}
+            >>> )
+            >>> catalog = KedroDataCatalog(datasets={'cars': cars})
+            >>>
+            >>> catalog, credentials, load_versions, save_version = data_catalog.to_config()
+            >>>
+            >>> new_catalog = KedroDataCatalog.from_config(config, credentials, load_versions, save_version)
+        """
         catalog: dict[str, dict[str, Any]] = {}
         credentials: dict[str, dict[str, Any]] = {}
         load_versions: dict[str, str | None] = {}

From c8fd99efd90d0d44a67f75d93f2d6d82b545f0da Mon Sep 17 00:00:00 2001
From: Elena Khaustova
Date: Wed, 27 Nov 2024 19:26:25 +0000
Subject: [PATCH 64/78] Added docstrings for unresolve_credentials

Signed-off-by: Elena Khaustova

---
 kedro/io/catalog_config_resolver.py | 19 +++++++++++++++++++
 kedro/io/kedro_data_catalog.py | 8 ++++----
 2 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py
index
41299f77d6..d4582d8e25 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -266,6 +266,25 @@ def resolve_credentials( def unresolve_credentials( cred_name: str, ds_config: dict[str, dict[str, Any]] | None ) -> tuple[dict[str, dict[str, Any]], dict[str, dict[str, Any]]]: + """Extracts and replaces credentials in a dataset configuration with + references, ensuring separation of credentials from the dataset configuration. + + Credentials are searched for recursively in the dataset configuration. + The first occurrence of the `CREDENTIALS_KEY` is replaced with a generated + reference key. + + Args: + cred_name: A unique identifier for the credentials being unresolved. + This is used to generate a reference key for the credentials. + ds_config: The dataset configuration containing potential credentials + under the key `CREDENTIALS_KEY`. + + Returns: + A tuple containing: + ds_config_copy : A deep copy of the original dataset + configuration with credentials replaced by reference keys. + credentials: A dictionary mapping generated reference keys to the original credentials. + """ ds_config_copy = copy.deepcopy(ds_config) or {} credentials: dict[str, Any] = {} credentials_ref = f"{cred_name}_{CREDENTIALS_KEY}" diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index ef65834940..8f05baa49b 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -383,12 +383,12 @@ def to_config( Returns: A tuple containing: - - catalog: A dictionary mapping dataset names to their unresolved configurations, + catalog: A dictionary mapping dataset names to their unresolved configurations, excluding in-memory datasets. - - credentials: A dictionary of unresolved credentials extracted from dataset configurations. - - load_versions: A dictionary mapping dataset names to specific versions to be loaded, + credentials: A dictionary of unresolved credentials extracted from dataset configurations. 
+ load_versions: A dictionary mapping dataset names to specific versions to be loaded, or `None` if no version is set. - - save_version: A global version identifier for saving datasets, or `None` if not specified. + save_version: A global version identifier for saving datasets, or `None` if not specified. Example: :: From 5b2f21fac53cba1e0e69c1fd2738af052806fb8f Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 27 Nov 2024 19:30:00 +0000 Subject: [PATCH 65/78] Updated release notes Signed-off-by: Elena Khaustova --- RELEASE.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/RELEASE.md b/RELEASE.md index 94fa345843..0cc0fdf013 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,8 +1,11 @@ # Upcoming Release ## Major features and improvements +* Implemented `KedroDataCatalog.to_config()` method that converts the catalog instance into a configuration format suitable for serialization. + ## Bug fixes and other changes * Added validation to ensure dataset versions consistency across catalog. + ## Breaking changes to the API ## Documentation changes ## Community contributions From 35dc1027f6db08ffcf07e9acf087e9089ffa8bca Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 27 Nov 2024 19:34:13 +0000 Subject: [PATCH 66/78] Fixed indentation Signed-off-by: Elena Khaustova --- kedro/io/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index 741c94df7c..b012345900 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -253,7 +253,7 @@ def to_config(self) -> dict[str, Any]: 4. Metadata Removal: - Removes the `metadata` key from the configuration if present. - Returns: + Returns: A dictionary containing the dataset's type and initialization arguments. 
""" return_config: dict[str, Any] = { From 2853fda918d13bdf2528ffc1f52f9648faa3cd47 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 27 Nov 2024 19:41:52 +0000 Subject: [PATCH 67/78] Fixed to_config() example Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 8f05baa49b..071efc5bca 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -402,7 +402,7 @@ def to_config( >>> ) >>> catalog = KedroDataCatalog(datasets={'cars': cars}) >>> - >>> catalog, credentials, load_versions, save_version = data_catalog.to_config() + >>> config, credentials, load_versions, save_version = catalog.to_config() >>> >>> new_catalog = KedroDataCatalog.from_config(config, credentials, load_versions, save_version) """ From 8f0fe4f9729cc03f8334e98b5fffeeaacefdb3d3 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 27 Nov 2024 19:43:53 +0000 Subject: [PATCH 68/78] Fixed indentation Signed-off-by: Elena Khaustova --- kedro/io/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index b012345900..724a3a3e61 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -245,7 +245,7 @@ def to_config(self) -> dict[str, Any]: - Extracts the underlying dataset's configuration. - Handles the `versioned` flag and removes unnecessary metadata. - Ensures the embedded dataset's configuration is appropriately flattened - or transformed. + or transformed. 3. Versioning: - If the dataset has a version key, sets the `versioned` flag in the configuration. 
From 0db9b46011b461ed3fc09ad44dbde62220e05928 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 27 Nov 2024 19:54:12 +0000 Subject: [PATCH 69/78] Fixed indentation Signed-off-by: Elena Khaustova --- kedro/io/core.py | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index 724a3a3e61..784643f12c 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -231,27 +231,21 @@ def from_config( def to_config(self) -> dict[str, Any]: """Converts the dataset instance into a dictionary-based configuration for - serialization. - - Ensures that any subclass-specific details are handled, with + serialization. Ensures that any subclass-specific details are handled, with additional logic for versioning and caching implemented for `CachedDataset`. - Functionality: - 1. Base Configuration: - - Adds a key for the dataset's type using its module and class name. - - Includes the initialization arguments (`_init_args`) if available. + Adds a key for the dataset's type using its module and class name and + includes the initialization arguments. - 2. Special Handling for `CachedDataset`: - - Extracts the underlying dataset's configuration. - - Handles the `versioned` flag and removes unnecessary metadata. - - Ensures the embedded dataset's configuration is appropriately flattened - or transformed. + For `CachedDataset` it extracts the underlying dataset's configuration, + handles the `versioned` flag and removes unnecessary metadata. It also + ensures the embedded dataset's configuration is appropriately flattened + or transformed. - 3. Versioning: - - If the dataset has a version key, sets the `versioned` flag in the configuration. + If the dataset has a version key, it sets the `versioned` flag in the + configuration. - 4. Metadata Removal: - - Removes the `metadata` key from the configuration if present. + Removes the `metadata` key from the configuration if present. 
Returns: A dictionary containing the dataset's type and initialization arguments. From 2f72e23db11866eddaec1d5645ba7caa93f1d2f8 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 27 Nov 2024 20:54:18 +0000 Subject: [PATCH 70/78] Added a note about to_config() constraints Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 071efc5bca..6743dbf3fd 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -378,9 +378,14 @@ def to_config( dict[str, str | None], str | None, ]: - """Converts the KedroDataCatalog instance into a configuration format suitable for + """Converts the `KedroDataCatalog` instance into a configuration format suitable for serialization. This includes datasets, credentials, and versioning information. + This method is only applicabe to catalogs that contain datasets initialized with static, primitive + parameters. For example, it will work fine if one passes credentials as dictionary to + `GBQQueryDataset` but not as `google.auth.credentials.Credentials` object. See + https://github.com/kedro-org/kedro-plugins/issues/950 for the details. + Returns: A tuple containing: catalog: A dictionary mapping dataset names to their unresolved configurations, From a7689b97f1b30e234614b50c709745be1401d7b2 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 28 Nov 2024 14:06:19 +0000 Subject: [PATCH 71/78] Fixed typo Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 6743dbf3fd..0f622f7d9e 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -381,7 +381,7 @@ def to_config( """Converts the `KedroDataCatalog` instance into a configuration format suitable for serialization. 
This includes datasets, credentials, and versioning information. - This method is only applicabe to catalogs that contain datasets initialized with static, primitive + This method is only applicable to catalogs that contain datasets initialized with static, primitive parameters. For example, it will work fine if one passes credentials as dictionary to `GBQQueryDataset` but not as `google.auth.credentials.Credentials` object. See https://github.com/kedro-org/kedro-plugins/issues/950 for the details. From 3c3664ebfd59f2aa81ee6f7e95a8460b08d564ee Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 28 Nov 2024 14:13:45 +0000 Subject: [PATCH 72/78] Replace type string with the constant Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 0f622f7d9e..8975656774 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -16,6 +16,7 @@ import re from typing import Any, Iterator, List # noqa: UP035 +from build.lib.kedro.io.core import TYPE_KEY from kedro.io.catalog_config_resolver import CatalogConfigResolver, Patterns from kedro.io.core import ( AbstractDataset, @@ -416,7 +417,7 @@ def to_config( load_versions: dict[str, str | None] = {} for ds_name, ds in self._lazy_datasets.items(): - if _is_memory_dataset(ds.config.get("type", "")): + if _is_memory_dataset(ds.config.get(TYPE_KEY, "")): continue unresolved_config, unresolved_credentials = ( self._config_resolver.unresolve_credentials(ds_name, ds.config) From b7183abf01b93039c52efed0a01f3f57bcd3bad7 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 28 Nov 2024 14:14:40 +0000 Subject: [PATCH 73/78] Replace type string with the constant Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 
8975656774..7f33e024f6 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -16,9 +16,9 @@ import re from typing import Any, Iterator, List # noqa: UP035 -from build.lib.kedro.io.core import TYPE_KEY from kedro.io.catalog_config_resolver import CatalogConfigResolver, Patterns from kedro.io.core import ( + TYPE_KEY, AbstractDataset, AbstractVersionedDataset, CatalogProtocol, From 171e80f287f3c8cafc612feedd49733ae3576d6a Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 28 Nov 2024 17:47:13 +0000 Subject: [PATCH 74/78] Moved _is_memory_dataset Signed-off-by: Elena Khaustova --- kedro/io/core.py | 10 ---------- kedro/io/kedro_data_catalog.py | 3 +-- kedro/io/memory_dataset.py | 10 ++++++++++ tests/io/test_core.py | 14 -------------- tests/io/test_memory_dataset.py | 16 ++++++++++++++++ 5 files changed, 27 insertions(+), 26 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index 784643f12c..f90acd6468 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -1127,13 +1127,3 @@ def _validate_versions( ) return cur_load_versions, cur_save_version - - -def _is_memory_dataset(ds_or_type: AbstractDataset | str) -> bool: - """Check if dataset or str type provided is a MemoryDataset.""" - if isinstance(ds_or_type, AbstractDataset): - return ds_or_type.__class__.__name__ == "MemoryDataset" - if isinstance(ds_or_type, str): - return ds_or_type in {"MemoryDataset", "kedro.io.memory_dataset.MemoryDataset"} - - return False diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 7f33e024f6..33128fd809 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -26,11 +26,10 @@ DatasetError, DatasetNotFoundError, Version, - _is_memory_dataset, _validate_versions, generate_timestamp, ) -from kedro.io.memory_dataset import MemoryDataset +from kedro.io.memory_dataset import MemoryDataset, _is_memory_dataset from kedro.utils import _format_rich, _has_rich_handler diff --git 
a/kedro/io/memory_dataset.py b/kedro/io/memory_dataset.py index 1e8eef8452..2fdedf29b5 100644 --- a/kedro/io/memory_dataset.py +++ b/kedro/io/memory_dataset.py @@ -140,3 +140,13 @@ def _copy_with_mode(data: Any, copy_mode: str) -> Any: ) return copied_data + + +def _is_memory_dataset(ds_or_type: AbstractDataset | str) -> bool: + """Check if dataset or str type provided is a MemoryDataset.""" + if isinstance(ds_or_type, MemoryDataset): + return True + if isinstance(ds_or_type, str): + return ds_or_type in {"MemoryDataset", "kedro.io.memory_dataset.MemoryDataset"} + + return False diff --git a/tests/io/test_core.py b/tests/io/test_core.py index 388f3c6d3f..7c30652b6b 100644 --- a/tests/io/test_core.py +++ b/tests/io/test_core.py @@ -18,7 +18,6 @@ DatasetError, Version, VersionNotFoundError, - _is_memory_dataset, generate_timestamp, get_filepath_str, get_protocol_and_path, @@ -594,16 +593,3 @@ def test_versioning_existing_dataset( Path(my_legacy_dataset._filepath.as_posix()).unlink() my_legacy_versioned_dataset.save(dummy_data) assert my_legacy_versioned_dataset.exists() - - -@pytest.mark.parametrize( - "ds_or_type,expected_result", - [ - ("MemoryDataset", True), - ("kedro.io.memory_dataset.MemoryDataset", True), - ("NotMemoryDataset", False), - (my_dataset, False), - ], -) -def test_is_memory_dataset(ds_or_type, expected_result): - assert _is_memory_dataset(ds_or_type) == expected_result diff --git a/tests/io/test_memory_dataset.py b/tests/io/test_memory_dataset.py index c2dbe56925..5a85400d66 100644 --- a/tests/io/test_memory_dataset.py +++ b/tests/io/test_memory_dataset.py @@ -3,11 +3,13 @@ import numpy as np import pandas as pd import pytest +from kedro_datasets.pandas import CSVDataset from kedro.io import DatasetError, MemoryDataset from kedro.io.memory_dataset import ( _copy_with_mode, _infer_copy_mode, + _is_memory_dataset, ) @@ -233,3 +235,17 @@ class DataFrame: data = DataFrame() copy_mode = _infer_copy_mode(data) assert copy_mode == "assign" + + 
+@pytest.mark.parametrize( + "ds_or_type,expected_result", + [ + ("MemoryDataset", True), + ("kedro.io.memory_dataset.MemoryDataset", True), + ("NotMemoryDataset", False), + (MemoryDataset(data=""), True), + (CSVDataset(filepath="abc.csv"), False), + ], +) +def test_is_memory_dataset(ds_or_type, expected_result): + assert _is_memory_dataset(ds_or_type) == expected_result From b789018a40d8bd2a1f3f9e86f62ccfe8aefdb0a1 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 28 Nov 2024 21:21:43 +0000 Subject: [PATCH 75/78] Simplified nested decorator Signed-off-by: Elena Khaustova --- kedro/io/core.py | 52 ++++++++++++------------------------------------ 1 file changed, 13 insertions(+), 39 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index f90acd6468..0ec1b1172a 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -162,21 +162,10 @@ class AbstractDataset(abc.ABC, Generic[_DI, _DO]): # Declares a class-level attribute that will store the initialization # arguments of an instance. Initially, it is set to None, but it will # hold a dictionary of arguments after initialization. + # It is overridden in the __init_subclass__ and further used as an + # instance attribute _init_args: dict[str, Any] | None = None - def __post_init__(self, call_args: dict[str, Any]) -> None: - """Handles additional setup after the object is initialized. - - Stores the initialization arguments (excluding `self`) in the `_init_args` attribute. - - Args: - call_args: A dictionary of arguments passed to the `__init__` method, captured - using `inspect.getcallargs`. - """ - - self._init_args = call_args - self._init_args.pop("self", None) - @classmethod def from_config( cls: type, @@ -255,6 +244,7 @@ def to_config(self) -> dict[str, Any]: } if self._init_args: + self._init_args.pop("self", None) return_config.update(self._init_args) if type(self).__name__ == "CachedDataset": @@ -365,7 +355,6 @@ def __init_subclass__(cls, **kwargs: Any) -> None: their creation. 
This method is automatically invoked when a subclass of AbstractDataset is defined. - Decorates the `load` and `save` methods provided by the class. If `_load` or `_save` are defined, alias them as a prerequisite. @@ -374,35 +363,20 @@ def __init_subclass__(cls, **kwargs: Any) -> None: # Save the original __init__ method of the subclass init_func: Callable = cls.__init__ - def init_decorator(previous_init: Callable) -> Callable: - """A decorator that wraps the original __init__ of the subclass. - - It ensures that after the original __init__ executes, the `__post_init__` - method of the instance is called with the arguments used to initialize - the object. + def new_init(self, *args, **kwargs) -> None: + """Executes the original __init__, then save the arguments used + to initialize the instance. """ + # Call the original __init__ method + init_func(self, *args, **kwargs) + # Capture and save the arguments passed to the original __init__ + call_args = getcallargs(init_func, self, *args, **kwargs) + self._init_args = call_args - def new_init(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] - """The decorated __init__ method. - - Executes the original __init__, then calls __post_init__ with the arguments - used to initialize the instance. 
- """ - - # Call the original __init__ method - previous_init(self, *args, **kwargs) - if type(self) is cls: - # Capture and process the arguments passed to the original __init__ - call_args = getcallargs(init_func, self, *args, **kwargs) - # Call the custom post-initialization method to save captured arguments - self.__post_init__(call_args) - - return new_init - - # Replace the subclass's __init__ with the decorated version + # Replace the subclass's __init__ with the new_init # A hook for subclasses to capture initialization arguments and save them # in the AbstractDataset._init_args field - cls.__init__ = init_decorator(cls.__init__) # type: ignore[method-assign] + cls.__init__ = new_init super().__init_subclass__(**kwargs) From 6ba6ee49e7532721dd23d798543ba92475c449de Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 28 Nov 2024 21:24:59 +0000 Subject: [PATCH 76/78] Fixed lint Signed-off-by: Elena Khaustova --- kedro/io/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index 0ec1b1172a..256927ca68 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -363,7 +363,7 @@ def __init_subclass__(cls, **kwargs: Any) -> None: # Save the original __init__ method of the subclass init_func: Callable = cls.__init__ - def new_init(self, *args, **kwargs) -> None: + def new_init(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] """Executes the original __init__, then save the arguments used to initialize the instance. 
""" @@ -376,7 +376,7 @@ def new_init(self, *args, **kwargs) -> None: # Replace the subclass's __init__ with the new_init # A hook for subclasses to capture initialization arguments and save them # in the AbstractDataset._init_args field - cls.__init__ = new_init + cls.__init__ = new_init # type: ignore[method-assign] super().__init_subclass__(**kwargs) From 700834653a61c5de35eb72953e5b2d6bb644d556 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 29 Nov 2024 18:42:35 +0000 Subject: [PATCH 77/78] Removed _init_args class attribute Signed-off-by: Elena Khaustova --- kedro/io/core.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index 256927ca68..cabd67bd28 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -159,12 +159,6 @@ class AbstractDataset(abc.ABC, Generic[_DI, _DO]): need to change the `_EPHEMERAL` attribute to 'True'. """ _EPHEMERAL = False - # Declares a class-level attribute that will store the initialization - # arguments of an instance. Initially, it is set to None, but it will - # hold a dictionary of arguments after initialization. 
- # It is overridden in the __init_subclass__ and further used as an - # instance attribute - _init_args: dict[str, Any] | None = None @classmethod def from_config( @@ -243,9 +237,9 @@ def to_config(self) -> dict[str, Any]: f"{TYPE_KEY}": f"{type(self).__module__}.{type(self).__name__}" } - if self._init_args: - self._init_args.pop("self", None) - return_config.update(self._init_args) + if self._init_args: # type: ignore[attr-defined] + self._init_args.pop("self", None) # type: ignore[attr-defined] + return_config.update(self._init_args) # type: ignore[attr-defined] if type(self).__name__ == "CachedDataset": cached_ds = return_config.pop("dataset") From 7af15a563e100d6b2251d3df883e7efd7ba1e6f0 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 29 Nov 2024 19:35:01 +0000 Subject: [PATCH 78/78] Returned @wraps Signed-off-by: Elena Khaustova --- kedro/io/core.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index cabd67bd28..1e518d5c7a 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -351,12 +351,12 @@ def __init_subclass__(cls, **kwargs: Any) -> None: Decorates the `load` and `save` methods provided by the class. If `_load` or `_save` are defined, alias them as a prerequisite. - """ # Save the original __init__ method of the subclass init_func: Callable = cls.__init__ + @wraps(init_func) def new_init(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] """Executes the original __init__, then save the arguments used to initialize the instance. 
@@ -364,8 +364,7 @@ def new_init(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] # Call the original __init__ method init_func(self, *args, **kwargs) # Capture and save the arguments passed to the original __init__ - call_args = getcallargs(init_func, self, *args, **kwargs) - self._init_args = call_args + self._init_args = getcallargs(init_func, self, *args, **kwargs) # Replace the subclass's __init__ with the new_init # A hook for subclasses to capture initialization arguments and save them