From a8f4fb3d9423fc3c1e2021e67d9209fdc3d8b9db Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 31 Jul 2024 18:16:56 +0100 Subject: [PATCH 001/173] Added a skeleton for AbstractDataCatalog and KedroDataCatalog Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 272 ++++++++++++++++++++++++++++++ 1 file changed, 272 insertions(+) create mode 100644 kedro/io/data_catalog_redesign.py diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py new file mode 100644 index 0000000000..ce5e3269d1 --- /dev/null +++ b/kedro/io/data_catalog_redesign.py @@ -0,0 +1,272 @@ +from __future__ import annotations + +import abc +import copy +import re +from typing import Any + +from parse import parse + +from kedro.io.core import AbstractDataset, DatasetError, DatasetNotFoundError, Version + +Patterns = dict[str, dict[str, Any]] + +CREDENTIALS_KEY = "credentials" + + +def _get_credentials(credentials_name: str, credentials: dict[str, Any]) -> Any: + """Return a set of credentials from the provided credentials dict. + + Args: + credentials_name: Credentials name. + credentials: A dictionary with all credentials. + + Returns: + The set of requested credentials. + + Raises: + KeyError: When a data set with the given name has not yet been + registered. + + """ + try: + return credentials[credentials_name] + except KeyError as exc: + raise KeyError( + f"Unable to find credentials '{credentials_name}': check your data " + "catalog and credentials configuration. See " + "https://kedro.readthedocs.io/en/stable/kedro.io.DataCatalog.html " + "for an example." + ) from exc + + +def _resolve_credentials( + config: dict[str, Any], credentials: dict[str, Any] +) -> dict[str, Any]: + """Return the dataset configuration where credentials are resolved using + credentials dictionary provided. + + Args: + config: Original dataset config, which may contain unresolved credentials. + credentials: A dictionary with all credentials. + + Returns: + The dataset config, where all the credentials are successfully resolved. + """ + config = copy.deepcopy(config) + + def _map_value(key: str, value: Any) -> Any: + if key == CREDENTIALS_KEY and isinstance(value, str): + return _get_credentials(value, credentials) + if isinstance(value, dict): + return {k: _map_value(k, v) for k, v in value.items()} + return value + + return {k: _map_value(k, v) for k, v in config.items()} + + +class AbstractDataCatalog: + datasets = None + + def __init__( + self, + datasets: dict[str, AbstractDataset] | None = None, + config: dict[str, dict[str, Any]] | None = None, + credentials: dict[str, dict[str, Any]] | None = None, + ) -> None: + self.config = config or {} + self.resolved_ds_configs = {} + self.datasets = datasets or {} + + self._dataset_patterns, self._default_pattern = self._get_patterns( + config, credentials + ) + + def __iter__(self): + yield from self.datasets.values() + + @staticmethod + def _is_pattern(pattern: str) -> bool: + """Check if a given string is a pattern. Assume that any name with '{' is a pattern.""" + return "{" in pattern + + @staticmethod + def _match_pattern(dataset_patterns: Patterns, dataset_name: str) -> str | None: + """Match a dataset name against patterns in a dictionary.""" + matches = ( + pattern + for pattern in dataset_patterns.keys() + if parse(pattern, dataset_name) + ) + return next(matches, None) + + @staticmethod + def _specificity(pattern: str) -> int: + """Helper function to check the length of exactly matched characters not inside brackets. + + Example: + :: + + >>> specificity("{namespace}.companies") = 10 + >>> specificity("{namespace}.{dataset}") = 1 + >>> specificity("france.companies") = 16 + """ + # Remove all the placeholders from the pattern and count the number of remaining chars + result = re.sub(r"\{.*?\}", "", pattern) + return len(result) + + @classmethod + def _sort_patterns(cls, dataset_patterns: Patterns) -> dict[str, dict[str, Any]]: + """Sort a dictionary of dataset patterns according to parsing rules. + + In order: + + 1. Decreasing specificity (number of characters outside the curly brackets) + 2. Decreasing number of placeholders (number of curly bracket pairs) + 3. Alphabetically + """ + sorted_keys = sorted( + dataset_patterns, + key=lambda pattern: ( + -(cls._specificity(pattern)), + -pattern.count("{"), + pattern, + ), + ) + catch_all = [ + pattern for pattern in sorted_keys if cls._specificity(pattern) == 0 + ] + if len(catch_all) > 1: + raise DatasetError( + f"Multiple catch-all patterns found in the catalog: {', '.join(catch_all)}. Only one catch-all pattern is allowed, remove the extras." + ) + return {key: dataset_patterns[key] for key in sorted_keys} + + @classmethod + def _get_patterns( + cls, + config: dict[str, dict[str, Any]] | None, + credentials: dict[str, dict[str, Any]] | None, + load_versions: dict[str, str] | None = None, + save_version: str | None = None, + ) -> tuple[Patterns, Patterns]: + dataset_patterns = {} + config = copy.deepcopy(config) or {} + credentials = copy.deepcopy(credentials) or {} + user_default = {} + + for ds_name, ds_config in config.items(): + if not isinstance(ds_config, dict): + raise DatasetError( + f"Catalog entry '{ds_name}' is not a valid dataset configuration. " + "\nHint: If this catalog entry is intended for variable interpolation, " + "make sure that the key is preceded by an underscore." + ) + + resolved_ds_config = _resolve_credentials( # noqa: PLW2901 + ds_config, credentials + ) + if cls._is_pattern(ds_name): + dataset_patterns[ds_name] = resolved_ds_config + else: + cls.datasets[ds_name] = AbstractDataset.from_config( + ds_name, + resolved_ds_config, + load_versions.get(ds_name), + save_version, + ) + + sorted_patterns = cls._sort_patterns(dataset_patterns) + if sorted_patterns: + # If the last pattern is a catch-all pattern, pop it and set it as the default + if cls._specificity(list(sorted_patterns.keys())[-1]) == 0: + last_pattern = sorted_patterns.popitem() + user_default = {last_pattern[0]: last_pattern[1]} + + missing_keys = [ + key + for key in load_versions.keys() + if not (key in config or cls._match_pattern(sorted_patterns, key)) + ] + if missing_keys: + raise DatasetNotFoundError( + f"'load_versions' keys [{', '.join(sorted(missing_keys))}] " + f"are not found in the catalog." + ) + + return sorted_patterns, user_default + + @classmethod + def _resolve_config( + cls, dataset_name: str, matched_pattern: str, config: dict + ) -> dict[str, Any]: + # get resolved dataset config + pass + + def resolve_patterns(self, datasets: str | list[str], **kwargs): + # Logic to resolve patterns and extend self.datasets with resolved names + # and self.resolved_config with resolved config + pass + + @classmethod + def from_config( + cls, + config: dict[str, dict[str, Any]] | None, + credentials: dict[str, dict[str, Any]] | None = None, + **kwargs, + ) -> AbstractDataCatalog: + # Create a data catalog from configuration. + pass + + @abc.abstractmethod + def get_dataset(self, dataset_name: str, **kwargs) -> Any: + self.resolve_patterns(dataset_name, **kwargs) + # Specific dataset type logic + + @abc.abstractmethod + def get_dataset_config(self, dataset_name: str) -> dict: + # Logic to get dataset config from self.config and self._dataset_patterns, self._default_patterns + pass + + +class KedroDataCatalog(AbstractDataCatalog): + def __init__( + self, + datasets: dict[str, AbstractDataset] | None = None, + config: dict[str, dict[str, Any]] | None = None, + load_versions: dict[str, str] | None = None, + save_version: str | None = None, + ) -> None: + super().__init__(datasets, config) + + self._load_versions = load_versions or {} + self._save_version = save_version + + @classmethod + def from_config( + cls, + config: dict[str, dict[str, Any]] | None, + credentials: dict[str, dict[str, Any]] | None = None, + load_versions: dict[str, str] | None = None, + save_version: str | None = None, + ) -> KedroDataCatalog: + pass + + def resolve_patterns( + self, + datasets: str | list[str], + version: Version | None = None, + suggest: bool = True, + ) -> None: + super().resolve_patterns(datasets) + # KedroDataCatalog related logic + + def get_dataset(self, dataset_name: str, **kwargs) -> AbstractDataset: + super().get_dataset(dataset_name, **kwargs) + dataset = self.datasets[dataset_name] + # Version related logic + return dataset + + def get_dataset_config(self, dataset_name: str) -> dict: + # Logic to get dataset config from self.config and self._dataset_patterns, self._default_patterns + pass From 7d5681840e271199466c3bb57be56609c5ceb14d Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 31 Jul 2024 19:20:58 +0100 Subject: [PATCH 002/173] Removed from_config method Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 47 ++++++++++--------------------- 1 file changed, 15 insertions(+), 32 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index ce5e3269d1..290ca0796c 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -81,6 +81,7 @@ def __init__( self._dataset_patterns, self._default_pattern = self._get_patterns( config, credentials ) + # Add datasets to catalog def __iter__(self): yield from self.datasets.values() @@ -169,6 +170,7 @@ def _get_patterns( if cls._is_pattern(ds_name): dataset_patterns[ds_name] = resolved_ds_config else: + # TODO: Move to another method - see __init__ - add datasets to catalog cls.datasets[ds_name] = AbstractDataset.from_config( ds_name, resolved_ds_config, @@ -183,17 +185,6 @@ def _get_patterns( last_pattern = sorted_patterns.popitem() user_default = {last_pattern[0]: last_pattern[1]} - missing_keys = [ - key - for key in load_versions.keys() - if not (key in config or cls._match_pattern(sorted_patterns, key)) - ] - if missing_keys: - raise DatasetNotFoundError( - f"'load_versions' keys [{', '.join(sorted(missing_keys))}] " - f"are not found in the catalog." - ) - return sorted_patterns, user_default @classmethod @@ -208,16 +199,6 @@ def resolve_patterns(self, datasets: str | list[str], **kwargs): # and self.resolved_config with resolved config pass - @classmethod - def from_config( - cls, - config: dict[str, dict[str, Any]] | None, - credentials: dict[str, dict[str, Any]] | None = None, - **kwargs, - ) -> AbstractDataCatalog: - # Create a data catalog from configuration. - pass - @abc.abstractmethod def get_dataset(self, dataset_name: str, **kwargs) -> Any: self.resolve_patterns(dataset_name, **kwargs) @@ -230,27 +211,29 @@ def get_dataset_config(self, dataset_name: str) -> dict: class KedroDataCatalog(AbstractDataCatalog): - def __init__( + def __init__( # noqa: PLR0913 self, datasets: dict[str, AbstractDataset] | None = None, config: dict[str, dict[str, Any]] | None = None, + credentials: dict[str, dict[str, Any]] | None = None, load_versions: dict[str, str] | None = None, save_version: str | None = None, ) -> None: - super().__init__(datasets, config) + super().__init__(datasets, config, credentials) self._load_versions = load_versions or {} self._save_version = save_version - @classmethod - def from_config( - cls, - config: dict[str, dict[str, Any]] | None, - credentials: dict[str, dict[str, Any]] | None = None, - load_versions: dict[str, str] | None = None, - save_version: str | None = None, - ) -> KedroDataCatalog: - pass + missing_keys = [ + key + for key in load_versions.keys() + if not (key in config or self._match_pattern(self._dataset_patterns, key)) + ] + if missing_keys: + raise DatasetNotFoundError( + f"'load_versions' keys [{', '.join(sorted(missing_keys))}] " + f"are not found in the catalog." + ) def resolve_patterns( self, From 0b80f23ae4534daec9524895f0b95a8218437f61 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 2 Aug 2024 16:43:00 +0100 Subject: [PATCH 003/173] Implemented _init_datasets method Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 75 ++++++++++++++++++++++--------- 1 file changed, 54 insertions(+), 21 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index 290ca0796c..0db539b3b1 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -65,6 +65,15 @@ def _map_value(key: str, value: Any) -> Any: return {k: _map_value(k, v) for k, v in config.items()} +def validate_dataset_config(ds_name: str, ds_config: Any) -> None: + if not isinstance(ds_config, dict): + raise DatasetError( + f"Catalog entry '{ds_name}' is not a valid dataset configuration. " + "\nHint: If this catalog entry is intended for variable interpolation, " + "make sure that the key is preceded by an underscore." + ) + + class AbstractDataCatalog: datasets = None @@ -81,7 +90,6 @@ def __init__( self._dataset_patterns, self._default_pattern = self._get_patterns( config, credentials ) - # Add datasets to catalog def __iter__(self): yield from self.datasets.values() @@ -143,13 +151,28 @@ def _sort_patterns(cls, dataset_patterns: Patterns) -> dict[str, dict[str, Any]] ) return {key: dataset_patterns[key] for key in sorted_keys} + @classmethod + def _init_datasets( + cls, + config: dict[str, dict[str, Any]] | None, + credentials: dict[str, dict[str, Any]] | None, + ) -> None: + for ds_name, ds_config in config.items(): + if not cls._is_pattern(ds_name): + validate_dataset_config(ds_name, ds_config) + resolved_ds_config = _resolve_credentials( # noqa: PLW2901 + ds_config, credentials + ) + cls.datasets[ds_name] = AbstractDataset.from_config( + ds_name, + resolved_ds_config, + ) + @classmethod def _get_patterns( cls, config: dict[str, dict[str, Any]] | None, credentials: dict[str, dict[str, Any]] | None, - load_versions: dict[str, str] | None = None, - save_version: str | None = None, ) -> tuple[Patterns, Patterns]: dataset_patterns = {} config = copy.deepcopy(config) or {} @@ -157,26 +180,12 @@ def _get_patterns( user_default = {} for ds_name, ds_config in config.items(): - if not isinstance(ds_config, dict): - raise DatasetError( - f"Catalog entry '{ds_name}' is not a valid dataset configuration. " - "\nHint: If this catalog entry is intended for variable interpolation, " - "make sure that the key is preceded by an underscore." - ) - - resolved_ds_config = _resolve_credentials( # noqa: PLW2901 - ds_config, credentials - ) if cls._is_pattern(ds_name): - dataset_patterns[ds_name] = resolved_ds_config - else: - # TODO: Move to another method - see __init__ - add datasets to catalog - cls.datasets[ds_name] = AbstractDataset.from_config( - ds_name, - resolved_ds_config, - load_versions.get(ds_name), - save_version, + validate_dataset_config(ds_name, ds_config) + resolved_ds_config = _resolve_credentials( # noqa: PLW2901 + ds_config, credentials ) + dataset_patterns[ds_name] = resolved_ds_config sorted_patterns = cls._sort_patterns(dataset_patterns) if sorted_patterns: @@ -211,6 +220,9 @@ def get_dataset_config(self, dataset_name: str) -> dict: class KedroDataCatalog(AbstractDataCatalog): + _save_version = None + _load_versions = None + def __init__( # noqa: PLR0913 self, datasets: dict[str, AbstractDataset] | None = None, @@ -224,6 +236,8 @@ def __init__( # noqa: PLR0913 self._load_versions = load_versions or {} self._save_version = save_version + self._init_datasets(config, credentials) + missing_keys = [ key for key in load_versions.keys() @@ -235,6 +249,25 @@ def __init__( # noqa: PLR0913 f"are not found in the catalog." ) + @classmethod + def _init_datasets( + self, + config: dict[str, dict[str, Any]] | None, + credentials: dict[str, dict[str, Any]] | None, + ) -> None: + for ds_name, ds_config in config.items(): + if not self._is_pattern(ds_name): + validate_dataset_config(ds_name, ds_config) + resolved_ds_config = _resolve_credentials( # noqa: PLW2901 + ds_config, credentials + ) + self.datasets[ds_name] = AbstractDataset.from_config( + ds_name, + resolved_ds_config, + self._load_versions.get(ds_name), + self._save_version, + ) + def resolve_patterns( self, datasets: str | list[str], From 5c727dfd8ccb79464b486f306762511ef436329d Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 2 Aug 2024 17:12:18 +0100 Subject: [PATCH 004/173] Implemented get dataset Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 76 +++++++++++++++++++++++++------ 1 file changed, 61 insertions(+), 15 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index 0db539b3b1..eab64793af 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -2,12 +2,19 @@ import abc import copy +import difflib import re from typing import Any from parse import parse -from kedro.io.core import AbstractDataset, DatasetError, DatasetNotFoundError, Version +from kedro.io.core import ( + AbstractDataset, + AbstractVersionedDataset, + DatasetError, + DatasetNotFoundError, + Version, +) Patterns = dict[str, dict[str, Any]] @@ -151,19 +158,18 @@ def _sort_patterns(cls, dataset_patterns: Patterns) -> dict[str, dict[str, Any]] ) return {key: dataset_patterns[key] for key in sorted_keys} - @classmethod def _init_datasets( - cls, + self, config: dict[str, dict[str, Any]] | None, credentials: dict[str, dict[str, Any]] | None, ) -> None: for ds_name, ds_config in config.items(): - if not cls._is_pattern(ds_name): + if not self._is_pattern(ds_name): validate_dataset_config(ds_name, ds_config) resolved_ds_config = _resolve_credentials( # noqa: PLW2901 ds_config, credentials ) - cls.datasets[ds_name] = AbstractDataset.from_config( + self.datasets[ds_name] = AbstractDataset.from_config( ds_name, resolved_ds_config, ) @@ -198,20 +204,55 @@ def _get_patterns( @classmethod def _resolve_config( - cls, dataset_name: str, matched_pattern: str, config: dict + cls, + dataset_name: str, + matched_pattern: str, + config: dict, ) -> dict[str, Any]: - # get resolved dataset config - pass + """Get resolved AbstractDataset from a factory config""" + result = parse(matched_pattern, dataset_name) + # Resolve the factory config for the dataset + if isinstance(config, dict): + for key, value in config.items(): + config[key] = cls._resolve_config(dataset_name, matched_pattern, value) + elif isinstance(config, (list, tuple)): + config = [ + cls._resolve_config(dataset_name, matched_pattern, value) + for value in config + ] + elif isinstance(config, str) and "}" in config: + try: + config = str(config).format_map(result.named) + except KeyError as exc: + raise DatasetError( + f"Unable to resolve '{config}' from the pattern '{matched_pattern}'. Keys used in the configuration " + f"should be present in the dataset factory pattern." + ) from exc + return config + @abc.abstractmethod def resolve_patterns(self, datasets: str | list[str], **kwargs): # Logic to resolve patterns and extend self.datasets with resolved names # and self.resolved_config with resolved config pass @abc.abstractmethod - def get_dataset(self, dataset_name: str, **kwargs) -> Any: + def get_dataset(self, dataset_name: str, suggest: bool = True, **kwargs) -> Any: self.resolve_patterns(dataset_name, **kwargs) - # Specific dataset type logic + + if dataset_name not in self.datasets: + error_msg = f"Dataset '{dataset_name}' not found in the catalog" + + # Flag to turn on/off fuzzy-matching which can be time consuming and + # slow down plugins like `kedro-viz` + if suggest: + matches = difflib.get_close_matches(dataset_name, self.datasets.keys()) + if matches: + suggestions = ", ".join(matches) + error_msg += f" - did you mean one of these instead: {suggestions}" + raise DatasetNotFoundError(error_msg) + + return self.datasets[dataset_name] @abc.abstractmethod def get_dataset_config(self, dataset_name: str) -> dict: @@ -249,7 +290,6 @@ def __init__( # noqa: PLR0913 f"are not found in the catalog." ) - @classmethod def _init_datasets( self, config: dict[str, dict[str, Any]] | None, @@ -277,10 +317,16 @@ def resolve_patterns( super().resolve_patterns(datasets) # KedroDataCatalog related logic - def get_dataset(self, dataset_name: str, **kwargs) -> AbstractDataset: - super().get_dataset(dataset_name, **kwargs) - dataset = self.datasets[dataset_name] - # Version related logic + def get_dataset( + self, dataset_name: str, suggest: bool = True, version: Version | None = None + ) -> AbstractDataset: + dataset = super().get_dataset(dataset_name, suggest) + + if version and isinstance(dataset, AbstractVersionedDataset): + # we only want to return a similar-looking dataset, + # not modify the one stored in the current catalog + dataset = dataset._copy(_version=version) + return dataset def get_dataset_config(self, dataset_name: str) -> dict: From 05c9171f313b1a5918fde6d437a97bac9d297059 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 2 Aug 2024 18:02:35 +0100 Subject: [PATCH 005/173] Started resolve_patterns implementation Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 48 +++++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 9 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index eab64793af..ab37dba061 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -48,7 +48,7 @@ def _get_credentials(credentials_name: str, credentials: dict[str, Any]) -> Any: def _resolve_credentials( - config: dict[str, Any], credentials: dict[str, Any] + config: dict[str, Any], credentials: dict[str, Any] | None ) -> dict[str, Any]: """Return the dataset configuration where credentials are resolved using credentials dictionary provided. @@ -93,10 +93,20 @@ def __init__( self.config = config or {} self.resolved_ds_configs = {} self.datasets = datasets or {} + self._dataset_patterns = {} + self._default_pattern = {} - self._dataset_patterns, self._default_pattern = self._get_patterns( - config, credentials - ) + # TODO: save resolved configs for two cases + + if config: + self._dataset_patterns, self._default_pattern = self._get_patterns( + config, credentials + ) + # Init datasets + + # TODO: resolve patterns - old init from constructor + if datasets: + pass def __iter__(self): yield from self.datasets.values() @@ -160,7 +170,7 @@ def _sort_patterns(cls, dataset_patterns: Patterns) -> dict[str, dict[str, Any]] def _init_datasets( self, - config: dict[str, dict[str, Any]] | None, + config: dict[str, dict[str, Any]], credentials: dict[str, dict[str, Any]] | None, ) -> None: for ds_name, ds_config in config.items(): @@ -231,10 +241,30 @@ def _resolve_config( return config @abc.abstractmethod - def resolve_patterns(self, datasets: str | list[str], **kwargs): - # Logic to resolve patterns and extend self.datasets with resolved names - # and self.resolved_config with resolved config - pass + def resolve_patterns( + self, datasets: str | list[str], **kwargs + ) -> dict[str, Any] | list[dict[str, Any]]: + if isinstance(datasets, str): + datasets = [datasets] + + # resolved_configs = [] + # + # for dataset_name in datasets: + # matched_pattern = self._match_pattern(self._dataset_patterns, dataset_name) + # + # if dataset_name not in self.datasets and matched_pattern: + # # If the dataset is a patterned dataset, materialise it and add it to + # # the catalog + # # TODO: Check how to save all resolved datasets configurations + # config_copy = copy.deepcopy( + # self._dataset_patterns.get(matched_pattern) + # or self._default_pattern.get(matched_pattern) + # or {} + # ) + # + # dataset_config = self._resolve_config( + # dataset_name, matched_pattern, config_copy + # ) @abc.abstractmethod def get_dataset(self, dataset_name: str, suggest: bool = True, **kwargs) -> Any: From 5c804d6ca911dc9c21b045ee4441eb25358000c4 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 5 Aug 2024 18:07:45 +0100 Subject: [PATCH 006/173] Implemented resolve_patterns Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 219 +++++++++++++++--------------- 1 file changed, 113 insertions(+), 106 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index ab37dba061..6268d0842c 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -81,7 +81,33 @@ def validate_dataset_config(ds_name: str, ds_config: Any) -> None: ) -class AbstractDataCatalog: +def _resolve_config( + dataset_name: str, + matched_pattern: str, + config: dict, +) -> dict[str, Any]: + """Get resolved AbstractDataset from a factory config""" + result = parse(matched_pattern, dataset_name) + # Resolve the factory config for the dataset + if isinstance(config, dict): + for key, value in config.items(): + config[key] = _resolve_config(dataset_name, matched_pattern, value) + elif isinstance(config, (list, tuple)): + config = [ + _resolve_config(dataset_name, matched_pattern, value) for value in config + ] + elif isinstance(config, str) and "}" in config: + try: + config = str(config).format_map(result.named) + except KeyError as exc: + raise DatasetError( + f"Unable to resolve '{config}' from the pattern '{matched_pattern}'. Keys used in the configuration " + f"should be present in the dataset factory pattern." + ) from exc + return config + + +class AbstractDataCatalog(abc.ABC): datasets = None def __init__( @@ -90,27 +116,37 @@ def __init__( config: dict[str, dict[str, Any]] | None = None, credentials: dict[str, dict[str, Any]] | None = None, ) -> None: - self.config = config or {} + self.config = {} self.resolved_ds_configs = {} self.datasets = datasets or {} self._dataset_patterns = {} self._default_pattern = {} - # TODO: save resolved configs for two cases + if datasets: + for ds_name in datasets: + self.resolved_ds_configs[ds_name] = {} if config: self._dataset_patterns, self._default_pattern = self._get_patterns( config, credentials ) - # Init datasets - - # TODO: resolve patterns - old init from constructor - if datasets: - pass + self._update_ds_configs(config) + self._init_datasets(config, credentials) def __iter__(self): yield from self.datasets.values() + def _update_ds_configs(self, config: dict[str, dict[str, Any]]) -> None: + for ds_name, ds_config in config.items(): + if ds_name in self._dataset_patterns: + self.resolved_ds_configs[ds_name] = _resolve_config( + ds_name, ds_name, self._dataset_patterns[ds_name] + ) + else: + self.resolved_ds_configs[ds_name] = _resolve_config( + ds_name, ds_name, ds_config + ) + @staticmethod def _is_pattern(pattern: str) -> bool: """Check if a given string is a pattern. Assume that any name with '{' is a pattern.""" @@ -168,9 +204,16 @@ def _sort_patterns(cls, dataset_patterns: Patterns) -> dict[str, dict[str, Any]] ) return {key: dataset_patterns[key] for key in sorted_keys} + @abc.abstractmethod + def _init_dataset(self, ds_name: str, config: dict[str, Any]) -> None: + raise NotImplementedError( + f"'{self.__class__.__name__}' is a subclass of AbstractDataCatalog and " + f"it must implement the '_init_dataset' method" + ) + def _init_datasets( self, - config: dict[str, dict[str, Any]], + config: dict[str, dict[str, Any]] | None, credentials: dict[str, dict[str, Any]] | None, ) -> None: for ds_name, ds_config in config.items(): @@ -179,10 +222,7 @@ def _init_datasets( resolved_ds_config = _resolve_credentials( # noqa: PLW2901 ds_config, credentials ) - self.datasets[ds_name] = AbstractDataset.from_config( - ds_name, - resolved_ds_config, - ) + self._init_dataset(ds_name, resolved_ds_config) @classmethod def _get_patterns( @@ -212,82 +252,74 @@ def _get_patterns( return sorted_patterns, user_default - @classmethod - def _resolve_config( - cls, - dataset_name: str, - matched_pattern: str, - config: dict, - ) -> dict[str, Any]: - """Get resolved AbstractDataset from a factory config""" - result = parse(matched_pattern, dataset_name) - # Resolve the factory config for the dataset - if isinstance(config, dict): - for key, value in config.items(): - config[key] = cls._resolve_config(dataset_name, matched_pattern, value) - elif isinstance(config, (list, tuple)): - config = [ - cls._resolve_config(dataset_name, matched_pattern, value) - for value in config - ] - elif isinstance(config, str) and "}" in config: - try: - config = str(config).format_map(result.named) - except KeyError as exc: - raise DatasetError( - f"Unable to resolve '{config}' from the pattern '{matched_pattern}'. Keys used in the configuration " - f"should be present in the dataset factory pattern." - ) from exc - return config - - @abc.abstractmethod def resolve_patterns( - self, datasets: str | list[str], **kwargs + self, datasets: str | list[str], suggest: bool = True ) -> dict[str, Any] | list[dict[str, Any]]: if isinstance(datasets, str): - datasets = [datasets] - - # resolved_configs = [] - # - # for dataset_name in datasets: - # matched_pattern = self._match_pattern(self._dataset_patterns, dataset_name) - # - # if dataset_name not in self.datasets and matched_pattern: - # # If the dataset is a patterned dataset, materialise it and add it to - # # the catalog - # # TODO: Check how to save all resolved datasets configurations - # config_copy = copy.deepcopy( - # self._dataset_patterns.get(matched_pattern) - # or self._default_pattern.get(matched_pattern) - # or {} - # ) - # - # dataset_config = self._resolve_config( - # dataset_name, matched_pattern, config_copy - # ) + datasets_lst = [datasets] + else: + datasets_lst = datasets + + resolved_configs = [] + + for ds_name in datasets_lst: + matched_pattern = self._match_pattern(self._dataset_patterns, ds_name) + if matched_pattern: + if ds_name not in self.datasets: + # If the dataset is a patterned dataset, materialise it and add it to + # the catalog + config_copy = copy.deepcopy( + self._dataset_patterns.get(matched_pattern) + or self._default_pattern.get(matched_pattern) + or {} + ) + ds_config = _resolve_config(ds_name, matched_pattern, config_copy) + + if ( + self._specificity(matched_pattern) == 0 + and matched_pattern in self._default_pattern + ): + self._logger.warning( + "Config from the dataset factory pattern '%s' in the catalog will be used to " + "override the default dataset creation for '%s'", + matched_pattern, + ds_name, + ) + resolved_configs.append(ds_config) + else: + resolved_configs.append(self.resolved_ds_configs.get(ds_name, {})) + else: + resolved_configs.append(None) - @abc.abstractmethod - def get_dataset(self, dataset_name: str, suggest: bool = True, **kwargs) -> Any: - self.resolve_patterns(dataset_name, **kwargs) + if isinstance(datasets, str): + return resolved_configs[0] + else: + return resolved_configs + + def get_dataset(self, ds_name: str, suggest: bool = True) -> Any: + ds_config = self.resolve_patterns(ds_name) - if dataset_name not in self.datasets: - error_msg = f"Dataset '{dataset_name}' not found in the catalog" + if ds_config is None: + error_msg = f"Dataset '{ds_name}' not found in the catalog" # Flag to turn on/off fuzzy-matching which can be time consuming and # slow down plugins like `kedro-viz` if suggest: - matches = difflib.get_close_matches(dataset_name, self.datasets.keys()) + matches = difflib.get_close_matches(ds_name, self.datasets.keys()) if matches: suggestions = ", ".join(matches) error_msg += f" - did you mean one of these instead: {suggestions}" raise DatasetNotFoundError(error_msg) + elif ds_name not in self.datasets: + self._init_dataset(ds_name, ds_config) + self.resolved_ds_configs[ds_name] = ds_config - return self.datasets[dataset_name] + return self.datasets[ds_name] - @abc.abstractmethod - def get_dataset_config(self, dataset_name: str) -> dict: - # Logic to get dataset config from self.config and self._dataset_patterns, self._default_patterns - pass + def get_dataset_config(self, ds_name: str) -> dict | None: + if ds_name in self.resolved_ds_configs: + return self.resolved_ds_configs[ds_name] + return None class KedroDataCatalog(AbstractDataCatalog): @@ -307,8 +339,6 @@ def __init__( # noqa: PLR0913 self._load_versions = load_versions or {} self._save_version = save_version - self._init_datasets(config, credentials) - missing_keys = [ key for key in load_versions.keys() @@ -320,32 +350,13 @@ def __init__( # noqa: PLR0913 f"are not found in the catalog." ) - def _init_datasets( - self, - config: dict[str, dict[str, Any]] | None, - credentials: dict[str, dict[str, Any]] | None, - ) -> None: - for ds_name, ds_config in config.items(): - if not self._is_pattern(ds_name): - validate_dataset_config(ds_name, ds_config) - resolved_ds_config = _resolve_credentials( # noqa: PLW2901 - ds_config, credentials - ) - self.datasets[ds_name] = AbstractDataset.from_config( - ds_name, - resolved_ds_config, - self._load_versions.get(ds_name), - self._save_version, - ) - - def resolve_patterns( - self, - datasets: str | list[str], - version: Version | None = None, - suggest: bool = True, - ) -> None: - super().resolve_patterns(datasets) - # KedroDataCatalog related logic + def _init_dataset(self, ds_name: str, config: dict[str, Any]): + self.datasets[ds_name] = AbstractDataset.from_config( + ds_name, + config, + self._load_versions.get(ds_name), + self._save_version, + ) def get_dataset( self, dataset_name: str, suggest: bool = True, version: Version | None = None @@ -358,7 +369,3 @@ def get_dataset( dataset = dataset._copy(_version=version) return dataset - - def get_dataset_config(self, dataset_name: str) -> dict: - # Logic to get dataset config from self.config and self._dataset_patterns, self._default_patterns - pass From 530f7d60a3d312688063a2f7b94886ed821451c3 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 5 Aug 2024 20:21:10 +0100 Subject: [PATCH 007/173] Fixed credentials resolving Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 68 +++++++++++++++++-------------- 1 file changed, 38 insertions(+), 30 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index 6268d0842c..346e422822 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -130,13 +130,19 @@ def __init__( self._dataset_patterns, self._default_pattern = self._get_patterns( config, credentials ) - self._update_ds_configs(config) + self._update_ds_configs(config, credentials) self._init_datasets(config, credentials) def __iter__(self): yield from self.datasets.values() - def _update_ds_configs(self, config: dict[str, dict[str, Any]]) -> None: + def _update_ds_configs( + self, + config: dict[str, dict[str, Any]], + credentials: dict[str, dict[str, Any]] | None, + ) -> None: + config = copy.deepcopy(config) or {} + credentials = copy.deepcopy(credentials) or {} for ds_name, ds_config in config.items(): if ds_name in self._dataset_patterns: self.resolved_ds_configs[ds_name] = _resolve_config( @@ -144,7 +150,7 @@ def _update_ds_configs(self, config: dict[str, dict[str, Any]]) -> None: ) else: self.resolved_ds_configs[ds_name] = _resolve_config( - ds_name, ds_name, ds_config + ds_name, ds_name, _resolve_credentials(ds_config, credentials) ) @staticmethod @@ -253,7 +259,7 @@ def _get_patterns( return sorted_patterns, user_default def resolve_patterns( - self, datasets: str | list[str], suggest: bool = True + self, datasets: str | list[str] ) -> dict[str, Any] | list[dict[str, Any]]: if isinstance(datasets, str): datasets_lst = [datasets] @@ -264,30 +270,29 @@ def resolve_patterns( for ds_name in datasets_lst: matched_pattern = self._match_pattern(self._dataset_patterns, ds_name) - if matched_pattern: - if ds_name not in self.datasets: - # If the dataset is a patterned dataset, materialise it and add it to - # the catalog - config_copy = copy.deepcopy( - self._dataset_patterns.get(matched_pattern) - or self._default_pattern.get(matched_pattern) - or {} + if matched_pattern and ds_name not in self.datasets: + # If the dataset is a patterned dataset, materialise it and add it to + # the catalog + config_copy = copy.deepcopy( + self._dataset_patterns.get(matched_pattern) + or self._default_pattern.get(matched_pattern) + or {} + ) + ds_config = _resolve_config(ds_name, matched_pattern, config_copy) + + if ( + self._specificity(matched_pattern) == 0 + and matched_pattern in self._default_pattern + ): + self._logger.warning( + "Config from the dataset factory pattern '%s' in the catalog will be used to " + "override the default dataset creation for '%s'", + matched_pattern, + ds_name, ) - ds_config = _resolve_config(ds_name, matched_pattern, config_copy) - - if ( - self._specificity(matched_pattern) == 0 - and matched_pattern in self._default_pattern - ): - self._logger.warning( - "Config from the dataset factory pattern '%s' in the catalog will be used to " - "override the default dataset creation for '%s'", - matched_pattern, - ds_name, - ) - resolved_configs.append(ds_config) - else: - resolved_configs.append(self.resolved_ds_configs.get(ds_name, {})) + resolved_configs.append(ds_config) + elif ds_name in self.datasets: + resolved_configs.append(self.resolved_ds_configs.get(ds_name, {})) else: resolved_configs.append(None) @@ -334,14 +339,17 @@ def __init__( # noqa: PLR0913 load_versions: dict[str, str] | None = None, save_version: str | None = None, ) -> None: - super().__init__(datasets, config, credentials) - self._load_versions = load_versions or {} self._save_version = save_version + super().__init__(datasets, config, credentials) + + # print(self.datasets) + # print(self.resolved_ds_configs) + missing_keys = [ key - for key in load_versions.keys() + for key in self._load_versions.keys() if not (key in config or self._match_pattern(self._dataset_patterns, key)) ] if missing_keys: From 64be83cab898099b8299a9588c741e4711c798ac Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 6 Aug 2024 11:57:43 +0100 Subject: [PATCH 008/173] Updated match pattern Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index 346e422822..37150d3b1c 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -158,12 +158,11 @@ def _is_pattern(pattern: str) -> bool: """Check if a given string is a pattern. Assume that any name with '{' is a pattern.""" return "{" in pattern - @staticmethod - def _match_pattern(dataset_patterns: Patterns, dataset_name: str) -> str | None: + def match_pattern(self, dataset_name: str) -> str | None: """Match a dataset name against patterns in a dictionary.""" matches = ( pattern - for pattern in dataset_patterns.keys() + for pattern in self._dataset_patterns.keys() if parse(pattern, dataset_name) ) return next(matches, None) @@ -269,7 +268,7 @@ def resolve_patterns( resolved_configs = [] for ds_name in datasets_lst: - matched_pattern = self._match_pattern(self._dataset_patterns, ds_name) + matched_pattern = self.match_pattern(ds_name) if matched_pattern and ds_name not in self.datasets: # If the dataset is a patterned dataset, materialise it and add it to # the catalog @@ -345,12 +344,17 @@ def __init__( # noqa: PLR0913 super().__init__(datasets, config, credentials) # print(self.datasets) + # print("-") # print(self.resolved_ds_configs) + # print("-") + # print(self._dataset_patterns) + # print("-") + # print(self._default_pattern) missing_keys = [ key for key in self._load_versions.keys() - if not (key in config or self._match_pattern(self._dataset_patterns, key)) + if not (key in config or self.match_pattern(key)) ] if missing_keys: raise DatasetNotFoundError( From c29828a00bb04cd8e16649c7c57fa5eaf33bb933 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 6 Aug 2024 13:41:21 +0100 Subject: [PATCH 009/173] Implemented add from dict method Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 61 ++++++++++++++++++++++++------- 1 file changed, 47 insertions(+), 14 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index 37150d3b1c..6bd33a6658 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -3,6 +3,7 @@ import abc import copy import difflib +import logging import re from typing import Any @@ -11,10 +12,12 @@ from kedro.io.core import ( AbstractDataset, AbstractVersionedDataset, + DatasetAlreadyExistsError, DatasetError, DatasetNotFoundError, Version, ) +from kedro.io.memory_dataset import MemoryDataset Patterns = dict[str, dict[str, Any]] @@ -112,7 +115,7 @@ class AbstractDataCatalog(abc.ABC): def __init__( self, - datasets: dict[str, AbstractDataset] | None = None, + datasets: dict[str, Any] | None = None, config: dict[str, dict[str, Any]] | None = None, credentials: dict[str, dict[str, Any]] | None = None, ) -> None: @@ -169,15 +172,7 @@ def match_pattern(self, dataset_name: str) -> str | None: @staticmethod def _specificity(pattern: str) -> int: - """Helper function to check the length of exactly matched characters not inside brackets. - - Example: - :: - - >>> specificity("{namespace}.companies") = 10 - >>> specificity("{namespace}.{dataset}") = 1 - >>> specificity("france.companies") = 16 - """ + """Helper function to check the length of exactly matched characters not inside brackets.""" # Remove all the placeholders from the pattern and count the number of remaining chars result = re.sub(r"\{.*?\}", "", pattern) return len(result) @@ -320,10 +315,25 @@ def get_dataset(self, ds_name: str, suggest: bool = True) -> Any: return self.datasets[ds_name] - def get_dataset_config(self, ds_name: str) -> dict | None: - if ds_name in self.resolved_ds_configs: - return self.resolved_ds_configs[ds_name] - return None + @abc.abstractmethod + def add_from_dict(self, datasets: dict[str, Any], **kwargs) -> None: + raise NotImplementedError( + f"'{self.__class__.__name__}' is a subclass of AbstractDataCatalog and " + f"it must implement the '_init_dataset' method" + ) + + def add(self, dataset_name: str, dataset: Any, **kwargs) -> None: + """Adds a new ``AbstractDataset`` object to the ``DataCatalog``.""" + if dataset_name in self.datasets: + raise DatasetAlreadyExistsError( + f"Dataset '{dataset_name}' has already been registered" + ) + self.datasets[dataset_name] = dataset + self.resolved_ds_configs[dataset_name] = {} + + @property + def _logger(self) -> logging.Logger: + return logging.getLogger(__name__) class KedroDataCatalog(AbstractDataCatalog): @@ -381,3 +391,26 @@ def get_dataset( dataset = dataset._copy(_version=version) return dataset + + def add( + self, dataset_name: str, dataset: AbstractDataset, replace: bool = False + ) -> None: + """Adds a new ``AbstractDataset`` object to the ``DataCatalog``.""" + if dataset_name in self.datasets: + if replace: + self._logger.warning("Replacing dataset '%s'", dataset_name) + else: + raise DatasetAlreadyExistsError( + f"Dataset '{dataset_name}' has already been registered" + ) + self.datasets[dataset_name] = dataset + self.resolved_ds_configs[dataset_name] = {} + + def add_from_dict(self, datasets: dict[str, Any], replace: bool = False) -> None: + for ds_name in datasets: + if isinstance(datasets[ds_name], AbstractDataset): + dataset = datasets[ds_name] + else: + dataset = MemoryDataset(data=datasets[ds_name]) # type: ignore[abstract] + + self.add(ds_name, dataset, replace) From 957403a080cae4d063c2ac636800a09e380e2021 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 6 Aug 2024 15:31:13 +0100 Subject: [PATCH 010/173] Updated io __init__ Signed-off-by: Elena Khaustova --- kedro/io/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kedro/io/__init__.py b/kedro/io/__init__.py index 7902f866bd..c907a66136 100644 --- a/kedro/io/__init__.py +++ b/kedro/io/__init__.py @@ -13,6 +13,7 @@ Version, ) from .data_catalog import DataCatalog +from .data_catalog_redesign import AbstractDataCatalog, KedroDataCatalog from .lambda_dataset import LambdaDataset from .memory_dataset import MemoryDataset from .shared_memory_dataset import SharedMemoryDataset @@ -29,4 +30,6 @@ "MemoryDataset", "SharedMemoryDataset", "Version", + "AbstractDataCatalog", + "KedroDataCatalog", ] From 14908ff9d44bba0aaa6e7fa1b0ded13f4ad3d0b1 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 6 Aug 2024 15:34:21 +0100 Subject: [PATCH 011/173] Added list method Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index 6bd33a6658..45520e8c51 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -335,6 +335,29 @@ def add(self, dataset_name: str, dataset: Any, **kwargs) -> None: def _logger(self) -> logging.Logger: return logging.getLogger(__name__) + def list(self, regex_search: str | None = None) -> list[str]: + """ + List of all dataset names registered in the catalog. + This can be filtered by providing an optional regular expression + which will only return matching keys. + """ + + if regex_search is None: + return list(self.datasets.keys()) + + if not regex_search.strip(): + self._logger.warning("The empty string will not match any data sets") + return [] + + try: + pattern = re.compile(regex_search, flags=re.IGNORECASE) + + except re.error as exc: + raise SyntaxError( + f"Invalid regular expression provided: '{regex_search}'" + ) from exc + return [ds_name for ds_name in self.datasets if pattern.search(ds_name)] + class KedroDataCatalog(AbstractDataCatalog): _save_version = None From c5e925bad2cce4e8fa537068c4c6d79a39460070 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 6 Aug 2024 23:00:39 +0100 Subject: [PATCH 012/173] Implemented _validate_missing_keys Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index 45520e8c51..7c79ef7b69 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -111,15 +111,13 @@ def _resolve_config( class AbstractDataCatalog(abc.ABC): - datasets = None - def __init__( self, datasets: dict[str, Any] | None = None, config: dict[str, dict[str, Any]] | None = None, credentials: dict[str, dict[str, Any]] | None = None, ) -> None: - self.config = {} + self.config = config or {} self.resolved_ds_configs = {} self.datasets = datasets or {} self._dataset_patterns = {} @@ -360,9 +358,6 @@ def list(self, regex_search: str | None = None) -> list[str]: class KedroDataCatalog(AbstractDataCatalog): - _save_version = None - _load_versions = None - def __init__( # noqa: PLR0913 self, datasets: dict[str, AbstractDataset] | None = None, @@ -376,18 +371,13 @@ def __init__( # noqa: PLR0913 super().__init__(datasets, config, credentials) - # print(self.datasets) - # print("-") - # print(self.resolved_ds_configs) - # print("-") - # print(self._dataset_patterns) - # print("-") - # print(self._default_pattern) + self._validate_missing_keys() + def _validate_missing_keys(self) -> None: missing_keys = [ key for key in self._load_versions.keys() - if not (key in config or self.match_pattern(key)) + if not (key in self.config or self.match_pattern(key)) ] if missing_keys: raise DatasetNotFoundError( From b9a92b0173ea934975e5a289df112960520b6332 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 7 Aug 2024 15:05:58 +0100 Subject: [PATCH 013/173] Added datasets access logic Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 80 +++++++++++++++++++++---------- 1 file changed, 56 insertions(+), 24 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index 7c79ef7b69..6e15cec62c 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -117,15 +117,15 @@ def __init__( config: dict[str, dict[str, Any]] | None = None, credentials: dict[str, dict[str, Any]] | None = None, ) -> None: - self.config = config or {} - self.resolved_ds_configs = {} - self.datasets = datasets or {} + self._config = config or {} + self._resolved_ds_configs = {} + self._datasets = datasets or {} self._dataset_patterns = {} self._default_pattern = {} if datasets: for ds_name in datasets: - self.resolved_ds_configs[ds_name] = {} + self._resolved_ds_configs[ds_name] = {} if config: self._dataset_patterns, self._default_pattern = self._get_patterns( @@ -134,8 +134,40 @@ def __init__( self._update_ds_configs(config, credentials) self._init_datasets(config, credentials) + @property + def datasets(self): + return copy.deepcopy(self._datasets) + + @datasets.setter + def datasets(self, value: Any): + msg = "Operation not allowed! Please change datasets through configuration." + raise AttributeError(msg) + + @property + def resolved_ds_configs(self): + return copy.deepcopy(self._resolved_ds_configs) + + @resolved_ds_configs.setter + def resolved_ds_configs(self, value: Any): + msg = "Operation not allowed! Please change datasets through configuration." + raise AttributeError(msg) + + @property + def dataset_patterns(self): + return self._dataset_patterns + + @property + def default_pattern(self): + return self._default_pattern + def __iter__(self): - yield from self.datasets.values() + yield from self._datasets.values() + + def __getitem__(self, ds_name: str) -> Any: + return self.get_dataset(ds_name) + + def _ipython_key_completions_(self) -> list[str]: + return list(self._datasets.keys()) def _update_ds_configs( self, @@ -146,11 +178,11 @@ def _update_ds_configs( credentials = copy.deepcopy(credentials) or {} for ds_name, ds_config in config.items(): if ds_name in self._dataset_patterns: - self.resolved_ds_configs[ds_name] = _resolve_config( + self._resolved_ds_configs[ds_name] = _resolve_config( ds_name, ds_name, self._dataset_patterns[ds_name] ) else: - self.resolved_ds_configs[ds_name] = _resolve_config( + self._resolved_ds_configs[ds_name] = _resolve_config( ds_name, ds_name, _resolve_credentials(ds_config, credentials) ) @@ -262,7 +294,7 @@ def resolve_patterns( for ds_name in datasets_lst: matched_pattern = self.match_pattern(ds_name) - if matched_pattern and ds_name not in self.datasets: + if matched_pattern and ds_name not in self._datasets: # If the dataset is a patterned dataset, materialise it and add it to # the catalog config_copy = copy.deepcopy( @@ -283,8 +315,8 @@ def resolve_patterns( ds_name, ) resolved_configs.append(ds_config) - elif ds_name in self.datasets: - resolved_configs.append(self.resolved_ds_configs.get(ds_name, {})) + elif ds_name in self._datasets: + resolved_configs.append(self._resolved_ds_configs.get(ds_name, {})) else: resolved_configs.append(None) @@ -302,16 +334,16 @@ def get_dataset(self, ds_name: str, suggest: bool = True) -> Any: # Flag to turn on/off fuzzy-matching which can be time consuming and # slow down plugins like `kedro-viz` if suggest: - matches = difflib.get_close_matches(ds_name, self.datasets.keys()) + matches = difflib.get_close_matches(ds_name, self._datasets.keys()) if matches: suggestions = ", ".join(matches) error_msg += f" - did you mean one of these instead: {suggestions}" raise DatasetNotFoundError(error_msg) - elif ds_name not in self.datasets: + elif ds_name not in self._datasets: self._init_dataset(ds_name, ds_config) - self.resolved_ds_configs[ds_name] = ds_config + self._resolved_ds_configs[ds_name] = ds_config - return self.datasets[ds_name] + return self._datasets[ds_name] @abc.abstractmethod def add_from_dict(self, datasets: dict[str, Any], **kwargs) -> None: @@ -322,12 +354,12 @@ def add_from_dict(self, datasets: dict[str, Any], **kwargs) -> None: def add(self, dataset_name: str, dataset: Any, **kwargs) -> None: """Adds a new ``AbstractDataset`` object to the ``DataCatalog``.""" - if dataset_name in self.datasets: + if dataset_name in self._datasets: raise DatasetAlreadyExistsError( f"Dataset '{dataset_name}' has already been registered" ) - self.datasets[dataset_name] = dataset - self.resolved_ds_configs[dataset_name] = {} + self._datasets[dataset_name] = dataset + self._resolved_ds_configs[dataset_name] = {} @property def _logger(self) -> logging.Logger: @@ -341,7 +373,7 @@ def list(self, regex_search: str | None = None) -> list[str]: """ if regex_search is None: - return list(self.datasets.keys()) + return list(self._datasets.keys()) if not regex_search.strip(): self._logger.warning("The empty string will not match any data sets") @@ -354,7 +386,7 @@ def list(self, regex_search: str | None = None) -> list[str]: raise SyntaxError( f"Invalid regular expression provided: '{regex_search}'" ) from exc - return [ds_name for ds_name in self.datasets if pattern.search(ds_name)] + return [ds_name for ds_name in self._datasets if pattern.search(ds_name)] class KedroDataCatalog(AbstractDataCatalog): @@ -377,7 +409,7 @@ def _validate_missing_keys(self) -> None: missing_keys = [ key for key in self._load_versions.keys() - if not (key in self.config or self.match_pattern(key)) + if not (key in self._config or self.match_pattern(key)) ] if missing_keys: raise DatasetNotFoundError( @@ -386,7 +418,7 @@ def _validate_missing_keys(self) -> None: ) def _init_dataset(self, ds_name: str, config: dict[str, Any]): - self.datasets[ds_name] = AbstractDataset.from_config( + self._datasets[ds_name] = AbstractDataset.from_config( ds_name, config, self._load_versions.get(ds_name), @@ -409,15 +441,15 @@ def add( self, dataset_name: str, dataset: AbstractDataset, replace: bool = False ) -> None: """Adds a new ``AbstractDataset`` object to the ``DataCatalog``.""" - if dataset_name in self.datasets: + if dataset_name in self._datasets: if replace: self._logger.warning("Replacing dataset '%s'", dataset_name) else: raise DatasetAlreadyExistsError( f"Dataset '{dataset_name}' has already been registered" ) - self.datasets[dataset_name] = dataset - self.resolved_ds_configs[dataset_name] = {} + self._datasets[dataset_name] = dataset + self._resolved_ds_configs[dataset_name] = {} def add_from_dict(self, datasets: dict[str, Any], replace: bool = False) -> None: for ds_name in datasets: From 2f3259378e92bc10b023c82b4ce9ceb1387046f5 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 7 Aug 2024 19:10:43 +0100 Subject: [PATCH 014/173] Added __contains__ and comments on lazy loading Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index 6e15cec62c..775264115f 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -166,6 +166,13 @@ def __iter__(self): def __getitem__(self, ds_name: str) -> Any: return self.get_dataset(ds_name) + def __contains__(self, dataset_name: str) -> bool: + """Check if an item is in the catalog as a materialised dataset or pattern""" + matched_pattern = self.match_pattern(dataset_name) + if dataset_name in self._datasets or matched_pattern: + return True + return False + def _ipython_key_completions_(self) -> list[str]: return list(self._datasets.keys()) @@ -249,9 +256,7 @@ def _init_datasets( for ds_name, ds_config in config.items(): if not self._is_pattern(ds_name): validate_dataset_config(ds_name, ds_config) - resolved_ds_config = _resolve_credentials( # noqa: PLW2901 - ds_config, credentials - ) + resolved_ds_config = _resolve_credentials(ds_config, credentials) self._init_dataset(ds_name, resolved_ds_config) @classmethod @@ -268,9 +273,7 @@ def _get_patterns( for ds_name, ds_config in config.items(): if cls._is_pattern(ds_name): validate_dataset_config(ds_name, ds_config) - resolved_ds_config = _resolve_credentials( # noqa: PLW2901 - ds_config, credentials - ) + resolved_ds_config = _resolve_credentials(ds_config, credentials) dataset_patterns[ds_name] = resolved_ds_config sorted_patterns = cls._sort_patterns(dataset_patterns) @@ -418,6 +421,9 @@ def _validate_missing_keys(self) -> None: ) def _init_dataset(self, ds_name: str, config: dict[str, Any]): + # Add LazyAbstractDataset to store the configuration but not to init actual dataset + # Initialise actual dataset when load or save + # Add is_ds_init property self._datasets[ds_name] = AbstractDataset.from_config( ds_name, config, From d1ea64ec59a71a0907e92b1f53e83fcce39813c8 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 8 Aug 2024 11:57:23 +0100 Subject: [PATCH 015/173] Renamed dataset_name to ds_name Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 46 +++++++++++++++---------------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index 775264115f..82e40cb214 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -85,20 +85,18 @@ def validate_dataset_config(ds_name: str, ds_config: Any) -> None: def _resolve_config( - dataset_name: str, + ds_name: str, matched_pattern: str, config: dict, ) -> dict[str, Any]: """Get resolved AbstractDataset from a factory config""" - result = parse(matched_pattern, dataset_name) + result = parse(matched_pattern, ds_name) # Resolve the factory config for the dataset if isinstance(config, dict): for key, value in config.items(): - config[key] = _resolve_config(dataset_name, matched_pattern, value) + config[key] = _resolve_config(ds_name, matched_pattern, value) elif isinstance(config, (list, tuple)): - config = [ - _resolve_config(dataset_name, matched_pattern, value) for value in config - ] + config = [_resolve_config(ds_name, matched_pattern, value) for value in config] elif isinstance(config, str) and "}" in config: try: config = str(config).format_map(result.named) @@ -166,10 +164,10 @@ def __iter__(self): def __getitem__(self, ds_name: str) -> Any: return self.get_dataset(ds_name) - def __contains__(self, dataset_name: str) -> bool: + def __contains__(self, ds_name: str) -> bool: """Check if an item is in the catalog as a materialised dataset or pattern""" - matched_pattern = self.match_pattern(dataset_name) - if dataset_name in self._datasets or matched_pattern: + matched_pattern = self.match_pattern(ds_name) + if ds_name in self._datasets or matched_pattern: return True return False @@ -198,12 +196,12 @@ def _is_pattern(pattern: str) -> bool: """Check if a given string is a pattern. Assume that any name with '{' is a pattern.""" return "{" in pattern - def match_pattern(self, dataset_name: str) -> str | None: + def match_pattern(self, ds_name: str) -> str | None: """Match a dataset name against patterns in a dictionary.""" matches = ( pattern for pattern in self._dataset_patterns.keys() - if parse(pattern, dataset_name) + if parse(pattern, ds_name) ) return next(matches, None) @@ -355,14 +353,14 @@ def add_from_dict(self, datasets: dict[str, Any], **kwargs) -> None: f"it must implement the '_init_dataset' method" ) - def add(self, dataset_name: str, dataset: Any, **kwargs) -> None: + def add(self, ds_name: str, dataset: Any, **kwargs) -> None: """Adds a new ``AbstractDataset`` object to the ``DataCatalog``.""" - if dataset_name in self._datasets: + if ds_name in self._datasets: raise DatasetAlreadyExistsError( - f"Dataset '{dataset_name}' has already been registered" + f"Dataset '{ds_name}' has already been registered" ) - self._datasets[dataset_name] = dataset - self._resolved_ds_configs[dataset_name] = {} + self._datasets[ds_name] = dataset + self._resolved_ds_configs[ds_name] = {} @property def _logger(self) -> logging.Logger: @@ -432,9 +430,9 @@ def _init_dataset(self, ds_name: str, config: dict[str, Any]): ) def get_dataset( - self, dataset_name: str, suggest: bool = True, version: Version | None = None + self, ds_name: str, suggest: bool = True, version: Version | None = None ) -> AbstractDataset: - dataset = super().get_dataset(dataset_name, suggest) + dataset = super().get_dataset(ds_name, suggest) if version and isinstance(dataset, AbstractVersionedDataset): # we only want to return a similar-looking dataset, @@ -444,18 +442,18 @@ def get_dataset( return dataset def add( - self, dataset_name: str, dataset: AbstractDataset, replace: bool = False + self, ds_name: str, dataset: AbstractDataset, replace: bool = False ) -> None: """Adds a new ``AbstractDataset`` object to the ``DataCatalog``.""" - if dataset_name in self._datasets: + if ds_name in self._datasets: if replace: - self._logger.warning("Replacing dataset '%s'", dataset_name) + self._logger.warning("Replacing dataset '%s'", ds_name) else: raise DatasetAlreadyExistsError( - f"Dataset '{dataset_name}' has already been registered" + f"Dataset '{ds_name}' has already been registered" ) - self._datasets[dataset_name] = dataset - self._resolved_ds_configs[dataset_name] = {} + self._datasets[ds_name] = dataset + self._resolved_ds_configs[ds_name] = {} def add_from_dict(self, datasets: dict[str, Any], replace: bool = False) -> None: for ds_name in datasets: From fb89fca9041cc0aff46ce6073ca570da553652c8 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 8 Aug 2024 12:00:13 +0100 Subject: [PATCH 016/173] Updated some docstrings Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index 82e40cb214..3bef34bd53 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -354,7 +354,7 @@ def add_from_dict(self, datasets: dict[str, Any], **kwargs) -> None: ) def add(self, ds_name: str, dataset: Any, **kwargs) -> None: - """Adds a new ``AbstractDataset`` object to the ``DataCatalog``.""" + """Adds a new dataset object to the ``AbstractDataCatalog``.""" if ds_name in self._datasets: raise DatasetAlreadyExistsError( f"Dataset '{ds_name}' has already been registered" @@ -444,7 +444,7 @@ def get_dataset( def add( self, ds_name: str, dataset: AbstractDataset, replace: bool = False ) -> None: - """Adds a new ``AbstractDataset`` object to the ``DataCatalog``.""" + """Adds a new ``AbstractDataset`` object to the ``KedroDataCatalog``.""" if ds_name in self._datasets: if replace: self._logger.warning("Replacing dataset '%s'", ds_name) From c6676459338507468dd4deba4066d71eff0ee9ef Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 12 Aug 2024 17:32:20 +0100 Subject: [PATCH 017/173] Fixed _update_ds_configs Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index 3bef34bd53..a8d929a8e5 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -182,13 +182,9 @@ def _update_ds_configs( config = copy.deepcopy(config) or {} credentials = copy.deepcopy(credentials) or {} for ds_name, ds_config in config.items(): - if ds_name in self._dataset_patterns: - self._resolved_ds_configs[ds_name] = _resolve_config( - ds_name, ds_name, self._dataset_patterns[ds_name] - ) - else: - self._resolved_ds_configs[ds_name] = _resolve_config( - ds_name, ds_name, _resolve_credentials(ds_config, credentials) + if not self._is_pattern(ds_name): + self._resolved_ds_configs[ds_name] = _resolve_credentials( + ds_config, credentials ) @staticmethod From be8e929f2553a9b7d127034b45be6b3b63ee6d04 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 12 Aug 2024 18:47:21 +0100 Subject: [PATCH 018/173] Fixed _init_datasets Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index a8d929a8e5..fc37e2f3ec 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -24,6 +24,14 @@ CREDENTIALS_KEY = "credentials" +class DatasetConfigurationNotFoundError(DatasetError): + """``DatasetConfigurationNotFoundError`` raised by ``DataCatalog`` class in case of + trying to get non-existing dataset configuration. + """ + + pass + + def _get_credentials(credentials_name: str, credentials: dict[str, Any]) -> Any: """Return a set of credentials from the provided credentials dict. @@ -130,7 +138,7 @@ def __init__( config, credentials ) self._update_ds_configs(config, credentials) - self._init_datasets(config, credentials) + self._init_datasets(config) @property def datasets(self): @@ -183,6 +191,7 @@ def _update_ds_configs( credentials = copy.deepcopy(credentials) or {} for ds_name, ds_config in config.items(): if not self._is_pattern(ds_name): + validate_dataset_config(ds_name, ds_config) self._resolved_ds_configs[ds_name] = _resolve_credentials( ds_config, credentials ) @@ -245,13 +254,15 @@ def _init_dataset(self, ds_name: str, config: dict[str, Any]) -> None: def _init_datasets( self, config: dict[str, dict[str, Any]] | None, - credentials: dict[str, dict[str, Any]] | None, ) -> None: - for ds_name, ds_config in config.items(): + for ds_name in config: if not self._is_pattern(ds_name): - validate_dataset_config(ds_name, ds_config) - resolved_ds_config = _resolve_credentials(ds_config, credentials) - self._init_dataset(ds_name, resolved_ds_config) + ds_resolved_config = self._resolved_ds_configs.get(ds_name, None) + if not ds_resolved_config: + raise DatasetConfigurationNotFoundError( + f"Dataset '{ds_name}' configuration is missing." + ) + self._init_dataset(ds_name, ds_resolved_config) @classmethod def _get_patterns( @@ -346,7 +357,7 @@ def get_dataset(self, ds_name: str, suggest: bool = True) -> Any: def add_from_dict(self, datasets: dict[str, Any], **kwargs) -> None: raise NotImplementedError( f"'{self.__class__.__name__}' is a subclass of AbstractDataCatalog and " - f"it must implement the '_init_dataset' method" + f"it must implement the 'add_from_dict' method" ) def add(self, ds_name: str, dataset: Any, **kwargs) -> None: From ec7ac394f5291c870f18549b70669be08f72d959 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 12 Aug 2024 19:15:12 +0100 Subject: [PATCH 019/173] Implemented add_runtime_patterns Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index fc37e2f3ec..1fbce495fe 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -128,6 +128,7 @@ def __init__( self._datasets = datasets or {} self._dataset_patterns = {} self._default_pattern = {} + self._runtime_patterns = {} if datasets: for ds_name in datasets: @@ -203,11 +204,10 @@ def _is_pattern(pattern: str) -> bool: def match_pattern(self, ds_name: str) -> str | None: """Match a dataset name against patterns in a dictionary.""" - matches = ( - pattern - for pattern in self._dataset_patterns.keys() - if parse(pattern, ds_name) - ) + all_patterns = list(self._dataset_patterns.keys()) + all_patterns.extend(list(self._default_pattern.keys())) + all_patterns.extend(list(self._runtime_patterns.keys())) + matches = (pattern for pattern in all_patterns if parse(pattern, ds_name)) return next(matches, None) @staticmethod @@ -348,8 +348,8 @@ def get_dataset(self, ds_name: str, suggest: bool = True) -> Any: error_msg += f" - did you mean one of these instead: {suggestions}" raise DatasetNotFoundError(error_msg) elif ds_name not in self._datasets: - self._init_dataset(ds_name, ds_config) self._resolved_ds_configs[ds_name] = ds_config + self._init_dataset(ds_name, ds_config) return self._datasets[ds_name] @@ -396,6 +396,10 @@ def list(self, regex_search: str | None = None) -> list[str]: ) from exc return [ds_name for ds_name in self._datasets if pattern.search(ds_name)] + def add_runtime_patterns(self, dataset_patterns: Patterns) -> None: + self._runtime_patterns = {**self._runtime_patterns, **dataset_patterns} + self._runtime_patterns = self._sort_patterns(self._runtime_patterns) + class KedroDataCatalog(AbstractDataCatalog): def __init__( # noqa: PLR0913 From 8e234507ec305d126294b773c1c5849f86b5d6e7 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 13 Aug 2024 12:10:12 +0100 Subject: [PATCH 020/173] Fixed runtime patterns usage Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 131 ++++++++++++++++++++++++++++++ 1 file changed, 131 insertions(+) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index 1fbce495fe..c43f221984 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -18,6 +18,7 @@ Version, ) from kedro.io.memory_dataset import MemoryDataset +from kedro.logging import _format_rich, _has_rich_handler Patterns = dict[str, dict[str, Any]] @@ -308,6 +309,7 @@ def resolve_patterns( config_copy = copy.deepcopy( self._dataset_patterns.get(matched_pattern) or self._default_pattern.get(matched_pattern) + or self._runtime_patterns.get(matched_pattern) or {} ) ds_config = _resolve_config(ds_name, matched_pattern, config_copy) @@ -396,6 +398,61 @@ def list(self, regex_search: str | None = None) -> list[str]: ) from exc return [ds_name for ds_name in self._datasets if pattern.search(ds_name)] + @abc.abstractmethod + def load(self, name: str, **kwargs) -> Any: + raise NotImplementedError( + f"'{self.__class__.__name__}' is a subclass of AbstractDataCatalog and " + f"it must implement the 'load' method" + ) + + def save(self, name: str, data: Any) -> None: + """Save data to a registered data set. + + Args: + name: A data set to be saved to. + data: A data object to be saved as configured in the registered + data set. + + Raises: + DatasetNotFoundError: When a data set with the given name + has not yet been registered. + + Example: + :: + + >>> import pandas as pd + >>> + >>> from kedro_datasets.pandas import CSVDataset + >>> + >>> cars = CSVDataset(filepath="cars.csv", + >>> load_args=None, + >>> save_args={"index": False}) + >>> catalog = DataCatalog(datasets={'cars': cars}) + >>> + >>> df = pd.DataFrame({'col1': [1, 2], + >>> 'col2': [4, 5], + >>> 'col3': [5, 6]}) + >>> catalog.save("cars", df) + """ + dataset = self.get_dataset(name) + + self._logger.info( + "Saving data to %s (%s)...", + _format_rich(name, "dark_orange") + if _has_rich_handler(self._logger) + else name, + type(dataset).__name__, + extra={"markup": True}, + ) + + dataset.save(data) + + def release(self, name: str) -> None: + pass + + def confirm(self, name: str) -> None: + pass + def add_runtime_patterns(self, dataset_patterns: Patterns) -> None: self._runtime_patterns = {**self._runtime_patterns, **dataset_patterns} self._runtime_patterns = self._sort_patterns(self._runtime_patterns) @@ -474,3 +531,77 @@ def add_from_dict(self, datasets: dict[str, Any], replace: bool = False) -> None dataset = MemoryDataset(data=datasets[ds_name]) # type: ignore[abstract] self.add(ds_name, dataset, replace) + + def load(self, name: str, version: str | None = None) -> Any: + """Loads a registered data set. + + Args: + name: A data set to be loaded. + version: Optional argument for concrete data version to be loaded. + Works only with versioned datasets. + + Returns: + The loaded data as configured. + + Raises: + DatasetNotFoundError: When a data set with the given name + has not yet been registered. + + Example: + :: + + >>> from kedro.io import DataCatalog + >>> from kedro_datasets.pandas import CSVDataset + >>> + >>> cars = CSVDataset(filepath="cars.csv", + >>> load_args=None, + >>> save_args={"index": False}) + >>> catalog = DataCatalog(datasets={'cars': cars}) + >>> + >>> df = catalog.load("cars") + """ + load_version = Version(version, None) if version else None + dataset = self.get_dataset(name, version=load_version) + + self._logger.info( + "Loading data from %s (%s)...", + _format_rich(name, "dark_orange") + if _has_rich_handler(self._logger) + else name, + type(dataset).__name__, + extra={"markup": True}, + ) + + result = dataset.load() + + return result + + def release(self, name: str) -> None: + """Release any cached data associated with a data set + + Args: + name: A data set to be checked. + + Raises: + DatasetNotFoundError: When a data set with the given name + has not yet been registered. + """ + dataset = self.get_dataset(name) + dataset.release() + + def confirm(self, name: str) -> None: + """Confirm a dataset by its name. + + Args: + name: Name of the dataset. + Raises: + DatasetError: When the dataset does not have `confirm` method. + + """ + self._logger.info("Confirming dataset '%s'", name) + dataset = self.get_dataset(name) + + if hasattr(dataset, "confirm"): + dataset.confirm() + else: + raise DatasetError(f"Dataset '{name}' does not have 'confirm' method") From 50bc8165297030890dd05d5c0da73c237fc604b8 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 21 Aug 2024 16:20:13 +0100 Subject: [PATCH 021/173] Moved pattern logic out of data catalog, implemented KedroDataCatalog Signed-off-by: Elena Khaustova --- kedro/config/__init__.py | 2 + kedro/config/config_resolver.py | 237 ++++++++++++++++ kedro/io/__init__.py | 3 +- kedro/io/data_catalog_redesign.py | 455 ++++++------------------------ 4 files changed, 324 insertions(+), 373 deletions(-) create mode 100644 kedro/config/config_resolver.py diff --git a/kedro/config/__init__.py b/kedro/config/__init__.py index 500cd62615..9b47743bfe 100644 --- a/kedro/config/__init__.py +++ b/kedro/config/__init__.py @@ -7,11 +7,13 @@ BadConfigException, MissingConfigException, ) +from .config_resolver import ConfigResolver from .omegaconf_config import OmegaConfigLoader __all__ = [ "AbstractConfigLoader", "BadConfigException", + "ConfigResolver", "MissingConfigException", "OmegaConfigLoader", ] diff --git a/kedro/config/config_resolver.py b/kedro/config/config_resolver.py new file mode 100644 index 0000000000..31fec7a339 --- /dev/null +++ b/kedro/config/config_resolver.py @@ -0,0 +1,237 @@ +import copy +import logging +import re +from typing import Any + +from parse import parse + +Patterns = dict[str, dict[str, Any]] + +CREDENTIALS_KEY = "credentials" + + +def _get_credentials(credentials_name: str, credentials: dict[str, Any]) -> Any: + """Return a set of credentials from the provided credentials dict. + + Args: + credentials_name: Credentials name. + credentials: A dictionary with all credentials. + + Returns: + The set of requested credentials. + + Raises: + KeyError: When a data set with the given name has not yet been + registered. + + """ + try: + return credentials[credentials_name] + except KeyError as exc: + raise KeyError( + f"Unable to find credentials '{credentials_name}': check your data " + "catalog and credentials configuration. See " + "https://kedro.readthedocs.io/en/stable/kedro.io.DataCatalog.html " + "for an example." + ) from exc + + +def _resolve_credentials( + config: dict[str, Any], credentials: dict[str, Any] | None +) -> dict[str, Any]: + """Return the dataset configuration where credentials are resolved using + credentials dictionary provided. + + Args: + config: Original dataset config, which may contain unresolved credentials. + credentials: A dictionary with all credentials. + + Returns: + The dataset config, where all the credentials are successfully resolved. + """ + config = copy.deepcopy(config) + + def _map_value(key: str, value: Any) -> Any: + if key == CREDENTIALS_KEY and isinstance(value, str): + return _get_credentials(value, credentials) + if isinstance(value, dict): + return {k: _map_value(k, v) for k, v in value.items()} + return value + + return {k: _map_value(k, v) for k, v in config.items()} + + +def _resolve_config( + ds_name: str, + matched_pattern: str, + config: dict, +) -> dict[str, Any]: + """Get resolved AbstractDataset from a factory config""" + result = parse(matched_pattern, ds_name) + # Resolve the factory config for the dataset + if isinstance(config, dict): + for key, value in config.items(): + config[key] = _resolve_config(ds_name, matched_pattern, value) + elif isinstance(config, (list, tuple)): + config = [_resolve_config(ds_name, matched_pattern, value) for value in config] + elif isinstance(config, str) and "}" in config: + try: + config = str(config).format_map(result.named) + except KeyError as exc: + raise KeyError( + f"Unable to resolve '{config}' from the pattern '{matched_pattern}'. Keys used in the configuration " + f"should be present in the dataset factory pattern." + ) from exc + return config + + +class ConfigResolver: + def __init__( + self, + config: dict[str, dict[str, Any]], + credentials: dict[str, dict[str, Any]] | None = None, + ): + self._runtime_patterns = {} + self._dataset_patterns, self._default_pattern = self._get_patterns( + config, credentials + ) + + self._ds_configs = self._get_ds_configs(config, credentials) + + @property + def _logger(self) -> logging.Logger: + return logging.getLogger(__name__) + + @staticmethod + def _is_pattern(pattern: str) -> bool: + """Check if a given string is a pattern. Assume that any name with '{' is a pattern.""" + return "{" in pattern + + @staticmethod + def _specificity(pattern: str) -> int: + """Helper function to check the length of exactly matched characters not inside brackets.""" + # Remove all the placeholders from the pattern and count the number of remaining chars + result = re.sub(r"\{.*?\}", "", pattern) + return len(result) + + @classmethod + def _sort_patterns(cls, dataset_patterns: Patterns) -> dict[str, dict[str, Any]]: + """Sort a dictionary of dataset patterns according to parsing rules. + + In order: + + 1. Decreasing specificity (number of characters outside the curly brackets) + 2. Decreasing number of placeholders (number of curly bracket pairs) + 3. Alphabetically + """ + sorted_keys = sorted( + dataset_patterns, + key=lambda pattern: ( + -(cls._specificity(pattern)), + -pattern.count("{"), + pattern, + ), + ) + catch_all = [ + pattern for pattern in sorted_keys if cls._specificity(pattern) == 0 + ] + if len(catch_all) > 1: + raise ValueError( + f"Multiple catch-all patterns found in the catalog: {', '.join(catch_all)}. Only one catch-all pattern is allowed, remove the extras." + ) + return {key: dataset_patterns[key] for key in sorted_keys} + + def match_pattern(self, ds_name: str) -> str | None: + """Match a dataset name against patterns in a dictionary.""" + all_patterns = list(self._dataset_patterns.keys()) + all_patterns.extend(list(self._default_pattern.keys())) + all_patterns.extend(list(self._runtime_patterns.keys())) + matches = (pattern for pattern in all_patterns if parse(pattern, ds_name)) + return next(matches, None) + + @classmethod + def _get_patterns( + cls, + config: dict[str, dict[str, Any]] | None, + credentials: dict[str, dict[str, Any]] | None, + ) -> tuple[Patterns, Patterns]: + dataset_patterns = {} + config = copy.deepcopy(config) or {} + credentials = copy.deepcopy(credentials) or {} + user_default = {} + + for ds_name, ds_config in config.items(): + if cls._is_pattern(ds_name): + resolved_ds_config = _resolve_credentials(ds_config, credentials) + dataset_patterns[ds_name] = resolved_ds_config + + sorted_patterns = cls._sort_patterns(dataset_patterns) + if sorted_patterns: + # If the last pattern is a catch-all pattern, pop it and set it as the default + if cls._specificity(list(sorted_patterns.keys())[-1]) == 0: + last_pattern = sorted_patterns.popitem() + user_default = {last_pattern[0]: last_pattern[1]} + + return sorted_patterns, user_default + + def _get_ds_configs( + self, + config: dict[str, dict[str, Any]], + credentials: dict[str, dict[str, Any]] | None, + ) -> dict[str, dict[str, Any]]: + config = copy.deepcopy(config) or {} + credentials = copy.deepcopy(credentials) or {} + ds_configs = {} + for ds_name, ds_config in config.items(): + if not self._is_pattern(ds_name): + ds_configs[ds_name] = _resolve_credentials(ds_config, credentials) + + return ds_configs + + def resolve_patterns( + self, datasets: str | list[str] + ) -> dict[str, Any] | list[dict[str, Any]]: + if isinstance(datasets, str): + datasets_lst = [datasets] + else: + datasets_lst = datasets + + resolved_configs = [] + + for ds_name in datasets_lst: + matched_pattern = self.match_pattern(ds_name) + if matched_pattern and ds_name not in self._ds_configs: + # If the dataset is a patterned dataset, materialise it and add it to + # the catalog + config_copy = copy.deepcopy( + self._dataset_patterns.get(matched_pattern) + or self._default_pattern.get(matched_pattern) + or self._runtime_patterns.get(matched_pattern) + or {} + ) + ds_config = _resolve_config(ds_name, matched_pattern, config_copy) + + if ( + self._specificity(matched_pattern) == 0 + and matched_pattern in self._default_pattern + ): + self._logger.warning( + "Config from the dataset factory pattern '%s' in the catalog will be used to " + "override the default dataset creation for '%s'", + matched_pattern, + ds_name, + ) + resolved_configs.append(ds_config) + elif ds_name in self._ds_configs: + resolved_configs.append(self._ds_configs.get(ds_name)) + else: + resolved_configs.append(None) + + if isinstance(datasets, str): + return resolved_configs[0] + else: + return resolved_configs + + def add_runtime_patterns(self, dataset_patterns: Patterns) -> None: + self._runtime_patterns = {**self._runtime_patterns, **dataset_patterns} + self._runtime_patterns = self._sort_patterns(self._runtime_patterns) diff --git a/kedro/io/__init__.py b/kedro/io/__init__.py index 1bc285067f..df7880557e 100644 --- a/kedro/io/__init__.py +++ b/kedro/io/__init__.py @@ -14,7 +14,7 @@ Version, ) from .data_catalog import DataCatalog -from .data_catalog_redesign import AbstractDataCatalog, KedroDataCatalog +from .data_catalog_redesign import KedroDataCatalog from .lambda_dataset import LambdaDataset from .memory_dataset import MemoryDataset from .shared_memory_dataset import SharedMemoryDataset @@ -31,6 +31,5 @@ "MemoryDataset", "SharedMemoryDataset", "Version", - "AbstractDataCatalog", "KedroDataCatalog", ] diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index c43f221984..cad4950c56 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -1,14 +1,11 @@ from __future__ import annotations -import abc import copy import difflib import logging import re from typing import Any -from parse import parse - from kedro.io.core import ( AbstractDataset, AbstractVersionedDataset, @@ -18,7 +15,7 @@ Version, ) from kedro.io.memory_dataset import MemoryDataset -from kedro.logging import _format_rich, _has_rich_handler +from kedro.utils import _format_rich, _has_rich_handler Patterns = dict[str, dict[str, Any]] @@ -33,57 +30,6 @@ class DatasetConfigurationNotFoundError(DatasetError): pass -def _get_credentials(credentials_name: str, credentials: dict[str, Any]) -> Any: - """Return a set of credentials from the provided credentials dict. - - Args: - credentials_name: Credentials name. - credentials: A dictionary with all credentials. - - Returns: - The set of requested credentials. - - Raises: - KeyError: When a data set with the given name has not yet been - registered. - - """ - try: - return credentials[credentials_name] - except KeyError as exc: - raise KeyError( - f"Unable to find credentials '{credentials_name}': check your data " - "catalog and credentials configuration. See " - "https://kedro.readthedocs.io/en/stable/kedro.io.DataCatalog.html " - "for an example." - ) from exc - - -def _resolve_credentials( - config: dict[str, Any], credentials: dict[str, Any] | None -) -> dict[str, Any]: - """Return the dataset configuration where credentials are resolved using - credentials dictionary provided. - - Args: - config: Original dataset config, which may contain unresolved credentials. - credentials: A dictionary with all credentials. - - Returns: - The dataset config, where all the credentials are successfully resolved. - """ - config = copy.deepcopy(config) - - def _map_value(key: str, value: Any) -> Any: - if key == CREDENTIALS_KEY and isinstance(value, str): - return _get_credentials(value, credentials) - if isinstance(value, dict): - return {k: _map_value(k, v) for k, v in value.items()} - return value - - return {k: _map_value(k, v) for k, v in config.items()} - - def validate_dataset_config(ds_name: str, ds_config: Any) -> None: if not isinstance(ds_config, dict): raise DatasetError( @@ -93,55 +39,30 @@ def validate_dataset_config(ds_name: str, ds_config: Any) -> None: ) -def _resolve_config( - ds_name: str, - matched_pattern: str, - config: dict, -) -> dict[str, Any]: - """Get resolved AbstractDataset from a factory config""" - result = parse(matched_pattern, ds_name) - # Resolve the factory config for the dataset - if isinstance(config, dict): - for key, value in config.items(): - config[key] = _resolve_config(ds_name, matched_pattern, value) - elif isinstance(config, (list, tuple)): - config = [_resolve_config(ds_name, matched_pattern, value) for value in config] - elif isinstance(config, str) and "}" in config: - try: - config = str(config).format_map(result.named) - except KeyError as exc: - raise DatasetError( - f"Unable to resolve '{config}' from the pattern '{matched_pattern}'. Keys used in the configuration " - f"should be present in the dataset factory pattern." - ) from exc - return config - - -class AbstractDataCatalog(abc.ABC): +class KedroDataCatalog: def __init__( self, datasets: dict[str, Any] | None = None, config: dict[str, dict[str, Any]] | None = None, - credentials: dict[str, dict[str, Any]] | None = None, + load_versions: dict[str, str] | None = None, + save_version: str | None = None, ) -> None: self._config = config or {} - self._resolved_ds_configs = {} self._datasets = datasets or {} - self._dataset_patterns = {} - self._default_pattern = {} self._runtime_patterns = {} + self._load_versions = load_versions or {} + self._save_version = save_version if datasets: for ds_name in datasets: - self._resolved_ds_configs[ds_name] = {} + # TODO: API to get configuration from dataset + self._config[ds_name] = {} if config: - self._dataset_patterns, self._default_pattern = self._get_patterns( - config, credentials - ) - self._update_ds_configs(config, credentials) self._init_datasets(config) + self._validate_missing_keys() + @property def datasets(self): return copy.deepcopy(self._datasets) @@ -152,22 +73,14 @@ def datasets(self, value: Any): raise AttributeError(msg) @property - def resolved_ds_configs(self): - return copy.deepcopy(self._resolved_ds_configs) + def config(self): + return copy.deepcopy(self._config) - @resolved_ds_configs.setter - def resolved_ds_configs(self, value: Any): + @config.setter + def config(self, value: Any): msg = "Operation not allowed! Please change datasets through configuration." raise AttributeError(msg) - @property - def dataset_patterns(self): - return self._dataset_patterns - - @property - def default_pattern(self): - return self._default_pattern - def __iter__(self): yield from self._datasets.values() @@ -175,201 +88,74 @@ def __getitem__(self, ds_name: str) -> Any: return self.get_dataset(ds_name) def __contains__(self, ds_name: str) -> bool: - """Check if an item is in the catalog as a materialised dataset or pattern""" - matched_pattern = self.match_pattern(ds_name) - if ds_name in self._datasets or matched_pattern: + """Check if an item is in the catalog""" + if ds_name in self._datasets: return True return False def _ipython_key_completions_(self) -> list[str]: return list(self._datasets.keys()) - def _update_ds_configs( - self, - config: dict[str, dict[str, Any]], - credentials: dict[str, dict[str, Any]] | None, - ) -> None: - config = copy.deepcopy(config) or {} - credentials = copy.deepcopy(credentials) or {} - for ds_name, ds_config in config.items(): - if not self._is_pattern(ds_name): - validate_dataset_config(ds_name, ds_config) - self._resolved_ds_configs[ds_name] = _resolve_credentials( - ds_config, credentials - ) - - @staticmethod - def _is_pattern(pattern: str) -> bool: - """Check if a given string is a pattern. Assume that any name with '{' is a pattern.""" - return "{" in pattern - - def match_pattern(self, ds_name: str) -> str | None: - """Match a dataset name against patterns in a dictionary.""" - all_patterns = list(self._dataset_patterns.keys()) - all_patterns.extend(list(self._default_pattern.keys())) - all_patterns.extend(list(self._runtime_patterns.keys())) - matches = (pattern for pattern in all_patterns if parse(pattern, ds_name)) - return next(matches, None) - - @staticmethod - def _specificity(pattern: str) -> int: - """Helper function to check the length of exactly matched characters not inside brackets.""" - # Remove all the placeholders from the pattern and count the number of remaining chars - result = re.sub(r"\{.*?\}", "", pattern) - return len(result) - - @classmethod - def _sort_patterns(cls, dataset_patterns: Patterns) -> dict[str, dict[str, Any]]: - """Sort a dictionary of dataset patterns according to parsing rules. - - In order: - - 1. Decreasing specificity (number of characters outside the curly brackets) - 2. Decreasing number of placeholders (number of curly bracket pairs) - 3. Alphabetically - """ - sorted_keys = sorted( - dataset_patterns, - key=lambda pattern: ( - -(cls._specificity(pattern)), - -pattern.count("{"), - pattern, - ), - ) - catch_all = [ - pattern for pattern in sorted_keys if cls._specificity(pattern) == 0 - ] - if len(catch_all) > 1: - raise DatasetError( - f"Multiple catch-all patterns found in the catalog: {', '.join(catch_all)}. Only one catch-all pattern is allowed, remove the extras." - ) - return {key: dataset_patterns[key] for key in sorted_keys} - - @abc.abstractmethod - def _init_dataset(self, ds_name: str, config: dict[str, Any]) -> None: - raise NotImplementedError( - f"'{self.__class__.__name__}' is a subclass of AbstractDataCatalog and " - f"it must implement the '_init_dataset' method" - ) - def _init_datasets( self, config: dict[str, dict[str, Any]] | None, ) -> None: - for ds_name in config: - if not self._is_pattern(ds_name): - ds_resolved_config = self._resolved_ds_configs.get(ds_name, None) - if not ds_resolved_config: - raise DatasetConfigurationNotFoundError( - f"Dataset '{ds_name}' configuration is missing." - ) - self._init_dataset(ds_name, ds_resolved_config) - - @classmethod - def _get_patterns( - cls, - config: dict[str, dict[str, Any]] | None, - credentials: dict[str, dict[str, Any]] | None, - ) -> tuple[Patterns, Patterns]: - dataset_patterns = {} - config = copy.deepcopy(config) or {} - credentials = copy.deepcopy(credentials) or {} - user_default = {} - for ds_name, ds_config in config.items(): - if cls._is_pattern(ds_name): - validate_dataset_config(ds_name, ds_config) - resolved_ds_config = _resolve_credentials(ds_config, credentials) - dataset_patterns[ds_name] = resolved_ds_config - - sorted_patterns = cls._sort_patterns(dataset_patterns) - if sorted_patterns: - # If the last pattern is a catch-all pattern, pop it and set it as the default - if cls._specificity(list(sorted_patterns.keys())[-1]) == 0: - last_pattern = sorted_patterns.popitem() - user_default = {last_pattern[0]: last_pattern[1]} - - return sorted_patterns, user_default - - def resolve_patterns( - self, datasets: str | list[str] - ) -> dict[str, Any] | list[dict[str, Any]]: - if isinstance(datasets, str): - datasets_lst = [datasets] - else: - datasets_lst = datasets - - resolved_configs = [] - - for ds_name in datasets_lst: - matched_pattern = self.match_pattern(ds_name) - if matched_pattern and ds_name not in self._datasets: - # If the dataset is a patterned dataset, materialise it and add it to - # the catalog - config_copy = copy.deepcopy( - self._dataset_patterns.get(matched_pattern) - or self._default_pattern.get(matched_pattern) - or self._runtime_patterns.get(matched_pattern) - or {} - ) - ds_config = _resolve_config(ds_name, matched_pattern, config_copy) - - if ( - self._specificity(matched_pattern) == 0 - and matched_pattern in self._default_pattern - ): - self._logger.warning( - "Config from the dataset factory pattern '%s' in the catalog will be used to " - "override the default dataset creation for '%s'", - matched_pattern, - ds_name, - ) - resolved_configs.append(ds_config) - elif ds_name in self._datasets: - resolved_configs.append(self._resolved_ds_configs.get(ds_name, {})) - else: - resolved_configs.append(None) + validate_dataset_config(ds_name, ds_config) + self._init_dataset(ds_name, ds_config) - if isinstance(datasets, str): - return resolved_configs[0] - else: - return resolved_configs + def _init_dataset(self, ds_name: str, config: dict[str, Any]): + # Add LazyAbstractDataset to store the configuration but not to init actual dataset + # Initialise actual dataset when load or save + # Add is_ds_init property + if ds_name in self._datasets: + raise DatasetAlreadyExistsError( + f"Dataset '{ds_name}' has already been registered" + ) + self._datasets[ds_name] = AbstractDataset.from_config( + ds_name, + config, + self._load_versions.get(ds_name), + self._save_version, + ) - def get_dataset(self, ds_name: str, suggest: bool = True) -> Any: - ds_config = self.resolve_patterns(ds_name) + def get_dataset( + self, ds_name: str, suggest: bool = True, version: Version | None = None + ) -> AbstractDataset: + dataset = self._datasets.get(ds_name, None) - if ds_config is None: + if dataset is None: error_msg = f"Dataset '{ds_name}' not found in the catalog" # Flag to turn on/off fuzzy-matching which can be time consuming and # slow down plugins like `kedro-viz` if suggest: - matches = difflib.get_close_matches(ds_name, self._datasets.keys()) + matches = difflib.get_close_matches(ds_name, self._config.keys()) if matches: suggestions = ", ".join(matches) error_msg += f" - did you mean one of these instead: {suggestions}" raise DatasetNotFoundError(error_msg) - elif ds_name not in self._datasets: - self._resolved_ds_configs[ds_name] = ds_config - self._init_dataset(ds_name, ds_config) - return self._datasets[ds_name] + if version and isinstance(dataset, AbstractVersionedDataset): + # we only want to return a similar-looking dataset, + # not modify the one stored in the current catalog + dataset = dataset._copy(_version=version) - @abc.abstractmethod - def add_from_dict(self, datasets: dict[str, Any], **kwargs) -> None: - raise NotImplementedError( - f"'{self.__class__.__name__}' is a subclass of AbstractDataCatalog and " - f"it must implement the 'add_from_dict' method" - ) + return dataset - def add(self, ds_name: str, dataset: Any, **kwargs) -> None: - """Adds a new dataset object to the ``AbstractDataCatalog``.""" + def add( + self, ds_name: str, dataset: AbstractDataset, replace: bool = False + ) -> None: + """Adds a new ``AbstractDataset`` object to the ``KedroDataCatalog``.""" if ds_name in self._datasets: - raise DatasetAlreadyExistsError( - f"Dataset '{ds_name}' has already been registered" - ) + if replace: + self._logger.warning("Replacing dataset '%s'", ds_name) + else: + raise DatasetAlreadyExistsError( + f"Dataset '{ds_name}' has already been registered" + ) self._datasets[ds_name] = dataset - self._resolved_ds_configs[ds_name] = {} + self._config[ds_name] = {} @property def _logger(self) -> logging.Logger: @@ -398,13 +184,6 @@ def list(self, regex_search: str | None = None) -> list[str]: ) from exc return [ds_name for ds_name in self._datasets if pattern.search(ds_name)] - @abc.abstractmethod - def load(self, name: str, **kwargs) -> Any: - raise NotImplementedError( - f"'{self.__class__.__name__}' is a subclass of AbstractDataCatalog and " - f"it must implement the 'load' method" - ) - def save(self, name: str, data: Any) -> None: """Save data to a registered data set. @@ -448,37 +227,38 @@ def save(self, name: str, data: Any) -> None: dataset.save(data) def release(self, name: str) -> None: - pass + """Release any cached data associated with a data set - def confirm(self, name: str) -> None: - pass + Args: + name: A data set to be checked. - def add_runtime_patterns(self, dataset_patterns: Patterns) -> None: - self._runtime_patterns = {**self._runtime_patterns, **dataset_patterns} - self._runtime_patterns = self._sort_patterns(self._runtime_patterns) + Raises: + DatasetNotFoundError: When a data set with the given name + has not yet been registered. + """ + dataset = self.get_dataset(name) + dataset.release() + def confirm(self, name: str) -> None: + """Confirm a dataset by its name. -class KedroDataCatalog(AbstractDataCatalog): - def __init__( # noqa: PLR0913 - self, - datasets: dict[str, AbstractDataset] | None = None, - config: dict[str, dict[str, Any]] | None = None, - credentials: dict[str, dict[str, Any]] | None = None, - load_versions: dict[str, str] | None = None, - save_version: str | None = None, - ) -> None: - self._load_versions = load_versions or {} - self._save_version = save_version + Args: + name: Name of the dataset. + Raises: + DatasetError: When the dataset does not have `confirm` method. - super().__init__(datasets, config, credentials) + """ + self._logger.info("Confirming dataset '%s'", name) + dataset = self.get_dataset(name) - self._validate_missing_keys() + if hasattr(dataset, "confirm"): + dataset.confirm() + else: + raise DatasetError(f"Dataset '{name}' does not have 'confirm' method") def _validate_missing_keys(self) -> None: missing_keys = [ - key - for key in self._load_versions.keys() - if not (key in self._config or self.match_pattern(key)) + key for key in self._load_versions.keys() if key not in self._config ] if missing_keys: raise DatasetNotFoundError( @@ -486,52 +266,6 @@ def _validate_missing_keys(self) -> None: f"are not found in the catalog." ) - def _init_dataset(self, ds_name: str, config: dict[str, Any]): - # Add LazyAbstractDataset to store the configuration but not to init actual dataset - # Initialise actual dataset when load or save - # Add is_ds_init property - self._datasets[ds_name] = AbstractDataset.from_config( - ds_name, - config, - self._load_versions.get(ds_name), - self._save_version, - ) - - def get_dataset( - self, ds_name: str, suggest: bool = True, version: Version | None = None - ) -> AbstractDataset: - dataset = super().get_dataset(ds_name, suggest) - - if version and isinstance(dataset, AbstractVersionedDataset): - # we only want to return a similar-looking dataset, - # not modify the one stored in the current catalog - dataset = dataset._copy(_version=version) - - return dataset - - def add( - self, ds_name: str, dataset: AbstractDataset, replace: bool = False - ) -> None: - """Adds a new ``AbstractDataset`` object to the ``KedroDataCatalog``.""" - if ds_name in self._datasets: - if replace: - self._logger.warning("Replacing dataset '%s'", ds_name) - else: - raise DatasetAlreadyExistsError( - f"Dataset '{ds_name}' has already been registered" - ) - self._datasets[ds_name] = dataset - self._resolved_ds_configs[ds_name] = {} - - def add_from_dict(self, datasets: dict[str, Any], replace: bool = False) -> None: - for ds_name in datasets: - if isinstance(datasets[ds_name], AbstractDataset): - dataset = datasets[ds_name] - else: - dataset = MemoryDataset(data=datasets[ds_name]) # type: ignore[abstract] - - self.add(ds_name, dataset, replace) - def load(self, name: str, version: str | None = None) -> Any: """Loads a registered data set. @@ -576,32 +310,11 @@ def load(self, name: str, version: str | None = None) -> Any: return result - def release(self, name: str) -> None: - """Release any cached data associated with a data set - - Args: - name: A data set to be checked. - - Raises: - DatasetNotFoundError: When a data set with the given name - has not yet been registered. - """ - dataset = self.get_dataset(name) - dataset.release() - - def confirm(self, name: str) -> None: - """Confirm a dataset by its name. - - Args: - name: Name of the dataset. - Raises: - DatasetError: When the dataset does not have `confirm` method. - - """ - self._logger.info("Confirming dataset '%s'", name) - dataset = self.get_dataset(name) + def add_from_dict(self, datasets: dict[str, Any], replace: bool = False) -> None: + for ds_name in datasets: + if isinstance(datasets[ds_name], AbstractDataset): + dataset = datasets[ds_name] + else: + dataset = MemoryDataset(data=datasets[ds_name]) # type: ignore[abstract] - if hasattr(dataset, "confirm"): - dataset.confirm() - else: - raise DatasetError(f"Dataset '{name}' does not have 'confirm' method") + self.add(ds_name, dataset, replace) From 9346f081c4a8f898499b7bb140375698a979059f Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 22 Aug 2024 19:02:13 +0100 Subject: [PATCH 022/173] KedroDataCatalog updates Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index cad4950c56..59efe7c777 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -52,6 +52,7 @@ def __init__( self._runtime_patterns = {} self._load_versions = load_versions or {} self._save_version = save_version + self._use_rich_markup = _has_rich_handler() if datasets: for ds_name in datasets: @@ -59,7 +60,8 @@ def __init__( self._config[ds_name] = {} if config: - self._init_datasets(config) + for ds_name, ds_config in config.items(): + self.init_dataset(ds_name, ds_config) self._validate_missing_keys() @@ -84,7 +86,7 @@ def config(self, value: Any): def __iter__(self): yield from self._datasets.values() - def __getitem__(self, ds_name: str) -> Any: + def __getitem__(self, ds_name: str) -> AbstractDataset: return self.get_dataset(ds_name) def __contains__(self, ds_name: str) -> bool: @@ -96,25 +98,19 @@ def __contains__(self, ds_name: str) -> bool: def _ipython_key_completions_(self) -> list[str]: return list(self._datasets.keys()) - def _init_datasets( - self, - config: dict[str, dict[str, Any]] | None, - ) -> None: - for ds_name, ds_config in config.items(): - validate_dataset_config(ds_name, ds_config) - self._init_dataset(ds_name, ds_config) - - def _init_dataset(self, ds_name: str, config: dict[str, Any]): + def init_dataset(self, ds_name: str, ds_config: dict[str, Any]): # Add LazyAbstractDataset to store the configuration but not to init actual dataset # Initialise actual dataset when load or save # Add is_ds_init property + validate_dataset_config(ds_name, ds_config) if ds_name in self._datasets: raise DatasetAlreadyExistsError( f"Dataset '{ds_name}' has already been registered" ) + self._config[ds_name] = ds_config self._datasets[ds_name] = AbstractDataset.from_config( ds_name, - config, + ds_config, self._load_versions.get(ds_name), self._save_version, ) @@ -217,9 +213,7 @@ def save(self, name: str, data: Any) -> None: self._logger.info( "Saving data to %s (%s)...", - _format_rich(name, "dark_orange") - if _has_rich_handler(self._logger) - else name, + _format_rich(name, "dark_orange") if self._use_rich_markup else name, type(dataset).__name__, extra={"markup": True}, ) @@ -299,9 +293,7 @@ def load(self, name: str, version: str | None = None) -> Any: self._logger.info( "Loading data from %s (%s)...", - _format_rich(name, "dark_orange") - if _has_rich_handler(self._logger) - else name, + _format_rich(name, "dark_orange") if self._use_rich_markup else name, type(dataset).__name__, extra={"markup": True}, ) From 9568e29d12f70732376115d3e653e70bdac0eabd Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 28 Aug 2024 19:07:55 +0100 Subject: [PATCH 023/173] Added property to return config Signed-off-by: Elena Khaustova --- kedro/config/config_resolver.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kedro/config/config_resolver.py b/kedro/config/config_resolver.py index 31fec7a339..9b870db21f 100644 --- a/kedro/config/config_resolver.py +++ b/kedro/config/config_resolver.py @@ -98,6 +98,10 @@ def __init__( self._ds_configs = self._get_ds_configs(config, credentials) + @property + def config(self): + return copy.deepcopy(self._ds_configs) + @property def _logger(self) -> logging.Logger: return logging.getLogger(__name__) From 5e27660678d4301e9e426a951ca4969b65f8c708 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 28 Aug 2024 19:12:05 +0100 Subject: [PATCH 024/173] Added list patterns method Signed-off-by: Elena Khaustova --- kedro/config/config_resolver.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kedro/config/config_resolver.py b/kedro/config/config_resolver.py index 9b870db21f..19c246e6db 100644 --- a/kedro/config/config_resolver.py +++ b/kedro/config/config_resolver.py @@ -145,11 +145,15 @@ def _sort_patterns(cls, dataset_patterns: Patterns) -> dict[str, dict[str, Any]] ) return {key: dataset_patterns[key] for key in sorted_keys} - def match_pattern(self, ds_name: str) -> str | None: - """Match a dataset name against patterns in a dictionary.""" + def list_patterns(self) -> list[str]: all_patterns = list(self._dataset_patterns.keys()) all_patterns.extend(list(self._default_pattern.keys())) all_patterns.extend(list(self._runtime_patterns.keys())) + return all_patterns + + def match_pattern(self, ds_name: str) -> str | None: + """Match a dataset name against patterns in a dictionary.""" + all_patterns = self.list_patterns() matches = (pattern for pattern in all_patterns if parse(pattern, ds_name)) return next(matches, None) From 72b11d00124d5c7b455441cff48929a2535e954c Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 29 Aug 2024 10:35:16 +0100 Subject: [PATCH 025/173] Renamed and moved ConfigResolver Signed-off-by: Elena Khaustova --- kedro/config/__init__.py | 2 -- kedro/io/__init__.py | 4 +++- .../config_resolver.py => io/catalog_config_resolver.py} | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) rename kedro/{config/config_resolver.py => io/catalog_config_resolver.py} (99%) diff --git a/kedro/config/__init__.py b/kedro/config/__init__.py index 9b47743bfe..500cd62615 100644 --- a/kedro/config/__init__.py +++ b/kedro/config/__init__.py @@ -7,13 +7,11 @@ BadConfigException, MissingConfigException, ) -from .config_resolver import ConfigResolver from .omegaconf_config import OmegaConfigLoader __all__ = [ "AbstractConfigLoader", "BadConfigException", - "ConfigResolver", "MissingConfigException", "OmegaConfigLoader", ] diff --git a/kedro/io/__init__.py b/kedro/io/__init__.py index df7880557e..5f51dc8a3b 100644 --- a/kedro/io/__init__.py +++ b/kedro/io/__init__.py @@ -5,6 +5,7 @@ from __future__ import annotations from .cached_dataset import CachedDataset +from .catalog_config_resolver import CatalogConfigResolver from .core import ( AbstractDataset, AbstractVersionedDataset, @@ -23,13 +24,14 @@ "AbstractDataset", "AbstractVersionedDataset", "CachedDataset", + "CatalogConfigResolver", "DataCatalog", "DatasetAlreadyExistsError", "DatasetError", "DatasetNotFoundError", + "KedroDataCatalog", "LambdaDataset", "MemoryDataset", "SharedMemoryDataset", "Version", - "KedroDataCatalog", ] diff --git a/kedro/config/config_resolver.py b/kedro/io/catalog_config_resolver.py similarity index 99% rename from kedro/config/config_resolver.py rename to kedro/io/catalog_config_resolver.py index 19c246e6db..88437b2532 100644 --- a/kedro/config/config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -85,7 +85,7 @@ def _resolve_config( return config -class ConfigResolver: +class CatalogConfigResolver: def __init__( self, config: dict[str, dict[str, Any]], From f0a409042ec63c3fd8e88a0906ea59f13416c580 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 29 Aug 2024 11:32:43 +0100 Subject: [PATCH 026/173] Renamed ConfigResolver Signed-off-by: Elena Khaustova --- kedro/io/__init__.py | 4 ++-- kedro/io/catalog_config_resolver.py | 2 +- kedro/io/data_catalog_redesign.py | 5 +++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/kedro/io/__init__.py b/kedro/io/__init__.py index 5f51dc8a3b..db3c295449 100644 --- a/kedro/io/__init__.py +++ b/kedro/io/__init__.py @@ -5,7 +5,7 @@ from __future__ import annotations from .cached_dataset import CachedDataset -from .catalog_config_resolver import CatalogConfigResolver +from .catalog_config_resolver import DataCatalogConfigResolver from .core import ( AbstractDataset, AbstractVersionedDataset, @@ -24,8 +24,8 @@ "AbstractDataset", "AbstractVersionedDataset", "CachedDataset", - "CatalogConfigResolver", "DataCatalog", + "DataCatalogConfigResolver", "DatasetAlreadyExistsError", "DatasetError", "DatasetNotFoundError", diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index 88437b2532..2238165037 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -85,7 +85,7 @@ def _resolve_config( return config -class CatalogConfigResolver: +class DataCatalogConfigResolver: def __init__( self, config: dict[str, dict[str, Any]], diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index 59efe7c777..13c57adbba 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -99,9 +99,9 @@ def _ipython_key_completions_(self) -> list[str]: return list(self._datasets.keys()) def init_dataset(self, ds_name: str, ds_config: dict[str, Any]): - # Add LazyAbstractDataset to store the configuration but not to init actual dataset + # Add lazy loading feature to store the configuration but not to init actual dataset # Initialise actual dataset when load or save - # Add is_ds_init property + # Add is_init property validate_dataset_config(ds_name, ds_config) if ds_name in self._datasets: raise DatasetAlreadyExistsError( @@ -151,6 +151,7 @@ def add( f"Dataset '{ds_name}' has already been registered" ) self._datasets[ds_name] = dataset + # TODO: API to get configuration from dataset self._config[ds_name] = {} @property From 7d6227f3d119cbfaf44897bc949fa7552d0e12c3 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 29 Aug 2024 12:17:22 +0100 Subject: [PATCH 027/173] Cleaned KedroDataCatalog Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 124 +++++++----------------------- 1 file changed, 26 insertions(+), 98 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index 13c57adbba..9618198291 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -17,19 +17,9 @@ from kedro.io.memory_dataset import MemoryDataset from kedro.utils import _format_rich, _has_rich_handler -Patterns = dict[str, dict[str, Any]] - CREDENTIALS_KEY = "credentials" -class DatasetConfigurationNotFoundError(DatasetError): - """``DatasetConfigurationNotFoundError`` raised by ``DataCatalog`` class in case of - trying to get non-existing dataset configuration. - """ - - pass - - def validate_dataset_config(ds_name: str, ds_config: Any) -> None: if not isinstance(ds_config, dict): raise DatasetError( @@ -49,39 +39,38 @@ def __init__( ) -> None: self._config = config or {} self._datasets = datasets or {} - self._runtime_patterns = {} self._load_versions = load_versions or {} self._save_version = save_version self._use_rich_markup = _has_rich_handler() - if datasets: - for ds_name in datasets: - # TODO: API to get configuration from dataset - self._config[ds_name] = {} + for ds_name in self._datasets: + # TODO: API to get configuration from dataset + self._config[ds_name] = {} - if config: - for ds_name, ds_config in config.items(): - self.init_dataset(ds_name, ds_config) + for ds_name, ds_config in self._config.items(): + self.init_dataset(ds_name, ds_config) self._validate_missing_keys() @property - def datasets(self): + def datasets(self) -> dict[str, Any]: return copy.deepcopy(self._datasets) @datasets.setter def datasets(self, value: Any): - msg = "Operation not allowed! Please change datasets through configuration." - raise AttributeError(msg) + raise AttributeError( + "Operation not allowed! Please change datasets through configuration." + ) @property def config(self): return copy.deepcopy(self._config) @config.setter - def config(self, value: Any): - msg = "Operation not allowed! Please change datasets through configuration." - raise AttributeError(msg) + def config(self, value: Any) -> dict[str, dict[str, Any]]: + raise AttributeError( + "Operation not allowed! Please change datasets through configuration." + ) def __iter__(self): yield from self._datasets.values() @@ -91,14 +80,12 @@ def __getitem__(self, ds_name: str) -> AbstractDataset: def __contains__(self, ds_name: str) -> bool: """Check if an item is in the catalog""" - if ds_name in self._datasets: - return True - return False + return ds_name in self._datasets def _ipython_key_completions_(self) -> list[str]: return list(self._datasets.keys()) - def init_dataset(self, ds_name: str, ds_config: dict[str, Any]): + def init_dataset(self, ds_name: str, ds_config: dict[str, Any]) -> None: # Add lazy loading feature to store the configuration but not to init actual dataset # Initialise actual dataset when load or save # Add is_init property @@ -122,11 +109,10 @@ def get_dataset( if dataset is None: error_msg = f"Dataset '{ds_name}' not found in the catalog" - # Flag to turn on/off fuzzy-matching which can be time consuming and # slow down plugins like `kedro-viz` if suggest: - matches = difflib.get_close_matches(ds_name, self._config.keys()) + matches = difflib.get_close_matches(ds_name, self._datasets.keys()) if matches: suggestions = ", ".join(matches) error_msg += f" - did you mean one of these instead: {suggestions}" @@ -174,7 +160,6 @@ def list(self, regex_search: str | None = None) -> list[str]: try: pattern = re.compile(regex_search, flags=re.IGNORECASE) - except re.error as exc: raise SyntaxError( f"Invalid regular expression provided: '{regex_search}'" @@ -182,34 +167,7 @@ def list(self, regex_search: str | None = None) -> list[str]: return [ds_name for ds_name in self._datasets if pattern.search(ds_name)] def save(self, name: str, data: Any) -> None: - """Save data to a registered data set. - - Args: - name: A data set to be saved to. - data: A data object to be saved as configured in the registered - data set. - - Raises: - DatasetNotFoundError: When a data set with the given name - has not yet been registered. - - Example: - :: - - >>> import pandas as pd - >>> - >>> from kedro_datasets.pandas import CSVDataset - >>> - >>> cars = CSVDataset(filepath="cars.csv", - >>> load_args=None, - >>> save_args={"index": False}) - >>> catalog = DataCatalog(datasets={'cars': cars}) - >>> - >>> df = pd.DataFrame({'col1': [1, 2], - >>> 'col2': [4, 5], - >>> 'col3': [5, 6]}) - >>> catalog.save("cars", df) - """ + """Save data to a registered data set.""" dataset = self.get_dataset(name) self._logger.info( @@ -252,9 +210,7 @@ def confirm(self, name: str) -> None: raise DatasetError(f"Dataset '{name}' does not have 'confirm' method") def _validate_missing_keys(self) -> None: - missing_keys = [ - key for key in self._load_versions.keys() if key not in self._config - ] + missing_keys = [key for key in self._load_versions if key not in self._config] if missing_keys: raise DatasetNotFoundError( f"'load_versions' keys [{', '.join(sorted(missing_keys))}] " @@ -262,33 +218,7 @@ def _validate_missing_keys(self) -> None: ) def load(self, name: str, version: str | None = None) -> Any: - """Loads a registered data set. - - Args: - name: A data set to be loaded. - version: Optional argument for concrete data version to be loaded. - Works only with versioned datasets. - - Returns: - The loaded data as configured. - - Raises: - DatasetNotFoundError: When a data set with the given name - has not yet been registered. - - Example: - :: - - >>> from kedro.io import DataCatalog - >>> from kedro_datasets.pandas import CSVDataset - >>> - >>> cars = CSVDataset(filepath="cars.csv", - >>> load_args=None, - >>> save_args={"index": False}) - >>> catalog = DataCatalog(datasets={'cars': cars}) - >>> - >>> df = catalog.load("cars") - """ + """Loads a registered data set.""" load_version = Version(version, None) if version else None dataset = self.get_dataset(name, version=load_version) @@ -299,15 +229,13 @@ def load(self, name: str, version: str | None = None) -> Any: extra={"markup": True}, ) - result = dataset.load() - - return result + return dataset.load() def add_from_dict(self, datasets: dict[str, Any], replace: bool = False) -> None: - for ds_name in datasets: - if isinstance(datasets[ds_name], AbstractDataset): - dataset = datasets[ds_name] - else: - dataset = MemoryDataset(data=datasets[ds_name]) # type: ignore[abstract] - + for ds_name, ds_data in datasets.items(): + dataset = ( + ds_data + if isinstance(ds_data, AbstractDataset) + else MemoryDataset(data=ds_data) + ) # type: ignore[abstract] self.add(ds_name, dataset, replace) From 409229164580989fef619c56ffbb9c7c9b803310 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 29 Aug 2024 14:35:13 +0100 Subject: [PATCH 028/173] Cleaned up DataCatalogConfigResolver Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 110 ++++++++++++++-------------- 1 file changed, 56 insertions(+), 54 deletions(-) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index 2238165037..921141fb61 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -10,8 +10,8 @@ CREDENTIALS_KEY = "credentials" -def _get_credentials(credentials_name: str, credentials: dict[str, Any]) -> Any: - """Return a set of credentials from the provided credentials dict. +def _fetch_credentials(credentials_name: str, credentials: dict[str, Any]) -> Any: + """Fetch the specified credentials from the provided credentials dictionary. Args: credentials_name: Credentials name. @@ -51,56 +51,57 @@ def _resolve_credentials( """ config = copy.deepcopy(config) - def _map_value(key: str, value: Any) -> Any: + def _resolve_value(key: str, value: Any) -> Any: if key == CREDENTIALS_KEY and isinstance(value, str): - return _get_credentials(value, credentials) + return _fetch_credentials(value, credentials) if isinstance(value, dict): - return {k: _map_value(k, v) for k, v in value.items()} + return {k: _resolve_value(k, v) for k, v in value.items()} return value - return {k: _map_value(k, v) for k, v in config.items()} + return {k: _resolve_value(k, v) for k, v in config.items()} -def _resolve_config( +def _resolve_dataset_config( ds_name: str, - matched_pattern: str, + pattern: str, config: dict, ) -> dict[str, Any]: - """Get resolved AbstractDataset from a factory config""" - result = parse(matched_pattern, ds_name) + """Resolve dataset configuration based on the provided pattern.""" + resolved_vars = parse(pattern, ds_name) # Resolve the factory config for the dataset if isinstance(config, dict): for key, value in config.items(): - config[key] = _resolve_config(ds_name, matched_pattern, value) + config[key] = _resolve_dataset_config(ds_name, pattern, value) elif isinstance(config, (list, tuple)): - config = [_resolve_config(ds_name, matched_pattern, value) for value in config] + config = [_resolve_dataset_config(ds_name, pattern, value) for value in config] elif isinstance(config, str) and "}" in config: try: - config = str(config).format_map(result.named) + config = config.format_map(resolved_vars.named) except KeyError as exc: raise KeyError( - f"Unable to resolve '{config}' from the pattern '{matched_pattern}'. Keys used in the configuration " + f"Unable to resolve '{config}' from the pattern '{pattern}'. Keys used in the configuration " f"should be present in the dataset factory pattern." ) from exc return config class DataCatalogConfigResolver: + """Resolves dataset configurations based on patterns and credentials.""" + def __init__( self, config: dict[str, dict[str, Any]], credentials: dict[str, dict[str, Any]] | None = None, ): - self._runtime_patterns = {} - self._dataset_patterns, self._default_pattern = self._get_patterns( + self._runtime_patterns: Patterns = {} + self._dataset_patterns, self._default_pattern = self._extract_patterns( config, credentials ) - - self._ds_configs = self._get_ds_configs(config, credentials) + self._resolved_configs = self._init_configs(config, credentials) @property - def config(self): - return copy.deepcopy(self._ds_configs) + def config(self) -> dict[str, dict[str, Any]]: + return copy.deepcopy(self._resolved_configs) @property def _logger(self) -> logging.Logger: @@ -112,18 +113,17 @@ def _is_pattern(pattern: str) -> bool: return "{" in pattern @staticmethod - def _specificity(pattern: str) -> int: - """Helper function to check the length of exactly matched characters not inside brackets.""" + def _pattern_specificity(pattern: str) -> int: + """Calculate the specificity of a pattern based on characters outside curly brackets.""" # Remove all the placeholders from the pattern and count the number of remaining chars result = re.sub(r"\{.*?\}", "", pattern) return len(result) @classmethod - def _sort_patterns(cls, dataset_patterns: Patterns) -> dict[str, dict[str, Any]]: + def _sort_patterns(cls, dataset_patterns: Patterns) -> Patterns: """Sort a dictionary of dataset patterns according to parsing rules. In order: - 1. Decreasing specificity (number of characters outside the curly brackets) 2. Decreasing number of placeholders (number of curly bracket pairs) 3. Alphabetically @@ -131,13 +131,13 @@ def _sort_patterns(cls, dataset_patterns: Patterns) -> dict[str, dict[str, Any]] sorted_keys = sorted( dataset_patterns, key=lambda pattern: ( - -(cls._specificity(pattern)), + -(cls._pattern_specificity(pattern)), -pattern.count("{"), pattern, ), ) catch_all = [ - pattern for pattern in sorted_keys if cls._specificity(pattern) == 0 + pattern for pattern in sorted_keys if cls._pattern_specificity(pattern) == 0 ] if len(catch_all) > 1: raise ValueError( @@ -146,10 +146,12 @@ def _sort_patterns(cls, dataset_patterns: Patterns) -> dict[str, dict[str, Any]] return {key: dataset_patterns[key] for key in sorted_keys} def list_patterns(self) -> list[str]: - all_patterns = list(self._dataset_patterns.keys()) - all_patterns.extend(list(self._default_pattern.keys())) - all_patterns.extend(list(self._runtime_patterns.keys())) - return all_patterns + """List al patterns available in the catalog.""" + return ( + list(self._dataset_patterns.keys()) + + list(self._default_pattern.keys()) + + list(self._runtime_patterns.keys()) + ) def match_pattern(self, ds_name: str) -> str | None: """Match a dataset name against patterns in a dictionary.""" @@ -158,57 +160,57 @@ def match_pattern(self, ds_name: str) -> str | None: return next(matches, None) @classmethod - def _get_patterns( + def _extract_patterns( cls, config: dict[str, dict[str, Any]] | None, credentials: dict[str, dict[str, Any]] | None, ) -> tuple[Patterns, Patterns]: - dataset_patterns = {} + """Extract and sort patterns from the configuration.""" config = copy.deepcopy(config) or {} credentials = copy.deepcopy(credentials) or {} + dataset_patterns = {} user_default = {} for ds_name, ds_config in config.items(): if cls._is_pattern(ds_name): - resolved_ds_config = _resolve_credentials(ds_config, credentials) - dataset_patterns[ds_name] = resolved_ds_config + resolved_config = _resolve_credentials(ds_config, credentials) + dataset_patterns[ds_name] = resolved_config sorted_patterns = cls._sort_patterns(dataset_patterns) if sorted_patterns: # If the last pattern is a catch-all pattern, pop it and set it as the default - if cls._specificity(list(sorted_patterns.keys())[-1]) == 0: + if cls._pattern_specificity(list(sorted_patterns.keys())[-1]) == 0: last_pattern = sorted_patterns.popitem() user_default = {last_pattern[0]: last_pattern[1]} return sorted_patterns, user_default - def _get_ds_configs( + def _init_configs( self, config: dict[str, dict[str, Any]], credentials: dict[str, dict[str, Any]] | None, ) -> dict[str, dict[str, Any]]: + """Initialize the dataset configuration with resolved credentials.""" config = copy.deepcopy(config) or {} credentials = copy.deepcopy(credentials) or {} - ds_configs = {} + resolved_configs = {} + for ds_name, ds_config in config.items(): if not self._is_pattern(ds_name): - ds_configs[ds_name] = _resolve_credentials(ds_config, credentials) + resolved_configs[ds_name] = _resolve_credentials(ds_config, credentials) - return ds_configs + return resolved_configs - def resolve_patterns( + def resolve_dataset_patterns( self, datasets: str | list[str] ) -> dict[str, Any] | list[dict[str, Any]]: - if isinstance(datasets, str): - datasets_lst = [datasets] - else: - datasets_lst = datasets - + """Resolve dataset patterns and return resolved configurations based on the existing patterns.""" + datasets_lst = [datasets] if isinstance(datasets, str) else datasets resolved_configs = [] for ds_name in datasets_lst: matched_pattern = self.match_pattern(ds_name) - if matched_pattern and ds_name not in self._ds_configs: + if matched_pattern and ds_name not in self._resolved_configs: # If the dataset is a patterned dataset, materialise it and add it to # the catalog config_copy = copy.deepcopy( @@ -217,10 +219,12 @@ def resolve_patterns( or self._runtime_patterns.get(matched_pattern) or {} ) - ds_config = _resolve_config(ds_name, matched_pattern, config_copy) + ds_config = _resolve_dataset_config( + ds_name, matched_pattern, config_copy + ) if ( - self._specificity(matched_pattern) == 0 + self._pattern_specificity(matched_pattern) == 0 and matched_pattern in self._default_pattern ): self._logger.warning( @@ -230,16 +234,14 @@ def resolve_patterns( ds_name, ) resolved_configs.append(ds_config) - elif ds_name in self._ds_configs: - resolved_configs.append(self._ds_configs.get(ds_name)) + elif ds_name in self._resolved_configs: + resolved_configs.append(self._resolved_configs.get(ds_name)) else: resolved_configs.append(None) - if isinstance(datasets, str): - return resolved_configs[0] - else: - return resolved_configs + return resolved_configs[0] if isinstance(datasets, str) else resolved_configs def add_runtime_patterns(self, dataset_patterns: Patterns) -> None: + """Add new runtime patterns and re-sort them.""" self._runtime_patterns = {**self._runtime_patterns, **dataset_patterns} self._runtime_patterns = self._sort_patterns(self._runtime_patterns) From 63e47f98ecddea0305a02c823c317d9300a388a8 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 30 Aug 2024 12:23:10 +0100 Subject: [PATCH 029/173] Docs build fix attempt Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index 921141fb61..1c7a39cfa9 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import copy import logging import re From 85bf720cf129cfce57e0b3baf5af57c8393af652 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 5 Sep 2024 12:07:35 +0100 Subject: [PATCH 030/173] KedroDataCatalog draft Signed-off-by: Elena Khaustova --- kedro/io/__init__.py | 2 + kedro/io/data_catalog_redesign.py | 237 ++++++++++++++++++++++++++++++ 2 files changed, 239 insertions(+) create mode 100644 kedro/io/data_catalog_redesign.py diff --git a/kedro/io/__init__.py b/kedro/io/__init__.py index aba59827e9..5f3d025fe5 100644 --- a/kedro/io/__init__.py +++ b/kedro/io/__init__.py @@ -14,6 +14,7 @@ Version, ) from .data_catalog import DataCatalog +from .data_catalog_redesign import KedroDataCatalog from .lambda_dataset import LambdaDataset from .memory_dataset import MemoryDataset from .shared_memory_dataset import SharedMemoryDataset @@ -26,6 +27,7 @@ "DatasetAlreadyExistsError", "DatasetError", "DatasetNotFoundError", + "KedroDataCatalog", "LambdaDataset", "MemoryDataset", "SharedMemoryDataset", diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py new file mode 100644 index 0000000000..590a187946 --- /dev/null +++ b/kedro/io/data_catalog_redesign.py @@ -0,0 +1,237 @@ +from __future__ import annotations + +import copy +import difflib +import logging +import re +from typing import Any + +from kedro.io.core import ( + AbstractDataset, + AbstractVersionedDataset, + DatasetAlreadyExistsError, + DatasetError, + DatasetNotFoundError, + Version, +) +from kedro.io.memory_dataset import MemoryDataset +from kedro.utils import _format_rich, _has_rich_handler + +CREDENTIALS_KEY = "credentials" + + +def validate_dataset_config(ds_name: str, ds_config: Any) -> None: + if not isinstance(ds_config, dict): + raise DatasetError( + f"Catalog entry '{ds_name}' is not a valid dataset configuration. " + "\nHint: If this catalog entry is intended for variable interpolation, " + "make sure that the key is preceded by an underscore." + ) + + +class KedroDataCatalog: + def __init__( + self, + datasets: dict[str, Any] | None = None, + config: dict[str, dict[str, Any]] | None = None, + load_versions: dict[str, str] | None = None, + save_version: str | None = None, + ) -> None: + self._config = config or {} + self._datasets = datasets or {} + self._load_versions = load_versions or {} + self._save_version = save_version + self._use_rich_markup = _has_rich_handler() + + for ds_name in self._datasets: + # TODO: API to get configuration from dataset + self._config[ds_name] = {} + + for ds_name, ds_config in self._config.items(): + self.init_dataset(ds_name, ds_config) + + self._validate_missing_keys() + + @property + def datasets(self) -> dict[str, Any]: + return copy.deepcopy(self._datasets) + + @datasets.setter + def datasets(self, value: Any): + raise AttributeError( + "Operation not allowed! Please change datasets through configuration." + ) + + @property + def config(self): + return copy.deepcopy(self._config) + + @config.setter + def config(self, value: Any) -> dict[str, dict[str, Any]]: + raise AttributeError( + "Operation not allowed! Please change datasets through configuration." + ) + + def __iter__(self): + yield from self._datasets.values() + + def __getitem__(self, ds_name: str) -> AbstractDataset: + return self.get_dataset(ds_name) + + def __contains__(self, ds_name: str) -> bool: + """Check if an item is in the catalog""" + return ds_name in self._datasets + + def _ipython_key_completions_(self) -> list[str]: + return list(self._datasets.keys()) + + def init_dataset(self, ds_name: str, ds_config: dict[str, Any]) -> None: + # Add lazy loading feature to store the configuration but not to init actual dataset + # Initialise actual dataset when load or save + # Add is_init property + validate_dataset_config(ds_name, ds_config) + if ds_name in self._datasets: + raise DatasetAlreadyExistsError( + f"Dataset '{ds_name}' has already been registered" + ) + self._config[ds_name] = ds_config + self._datasets[ds_name] = AbstractDataset.from_config( + ds_name, + ds_config, + self._load_versions.get(ds_name), + self._save_version, + ) + + def get_dataset( + self, ds_name: str, suggest: bool = True, version: Version | None = None + ) -> AbstractDataset: + dataset = self._datasets.get(ds_name, None) + + if dataset is None: + error_msg = f"Dataset '{ds_name}' not found in the catalog" + # Flag to turn on/off fuzzy-matching which can be time consuming and + # slow down plugins like `kedro-viz` + if suggest: + matches = difflib.get_close_matches(ds_name, self._datasets.keys()) + if matches: + suggestions = ", ".join(matches) + error_msg += f" - did you mean one of these instead: {suggestions}" + raise DatasetNotFoundError(error_msg) + + if version and isinstance(dataset, AbstractVersionedDataset): + # we only want to return a similar-looking dataset, + # not modify the one stored in the current catalog + dataset = dataset._copy(_version=version) + + return dataset + + def add( + self, ds_name: str, dataset: AbstractDataset, replace: bool = False + ) -> None: + """Adds a new ``AbstractDataset`` object to the ``KedroDataCatalog``.""" + if ds_name in self._datasets: + if replace: + self._logger.warning("Replacing dataset '%s'", ds_name) + else: + raise DatasetAlreadyExistsError( + f"Dataset '{ds_name}' has already been registered" + ) + self._datasets[ds_name] = dataset + # TODO: API to get configuration from dataset + self._config[ds_name] = {} + + @property + def _logger(self) -> logging.Logger: + return logging.getLogger(__name__) + + def list(self, regex_search: str | None = None) -> list[str]: + """ + List of all dataset names registered in the catalog. + This can be filtered by providing an optional regular expression + which will only return matching keys. + """ + + if regex_search is None: + return list(self._datasets.keys()) + + if not regex_search.strip(): + self._logger.warning("The empty string will not match any data sets") + return [] + + try: + pattern = re.compile(regex_search, flags=re.IGNORECASE) + except re.error as exc: + raise SyntaxError( + f"Invalid regular expression provided: '{regex_search}'" + ) from exc + return [ds_name for ds_name in self._datasets if pattern.search(ds_name)] + + def save(self, name: str, data: Any) -> None: + """Save data to a registered data set.""" + dataset = self.get_dataset(name) + + self._logger.info( + "Saving data to %s (%s)...", + _format_rich(name, "dark_orange") if self._use_rich_markup else name, + type(dataset).__name__, + extra={"markup": True}, + ) + + dataset.save(data) + + def release(self, name: str) -> None: + """Release any cached data associated with a data set + Args: + name: A data set to be checked. + Raises: + DatasetNotFoundError: When a data set with the given name + has not yet been registered. + """ + dataset = self.get_dataset(name) + dataset.release() + + def confirm(self, name: str) -> None: + """Confirm a dataset by its name. + Args: + name: Name of the dataset. + Raises: + DatasetError: When the dataset does not have `confirm` method. + """ + self._logger.info("Confirming dataset '%s'", name) + dataset = self.get_dataset(name) + + if hasattr(dataset, "confirm"): + dataset.confirm() + else: + raise DatasetError(f"Dataset '{name}' does not have 'confirm' method") + + def _validate_missing_keys(self) -> None: + missing_keys = [key for key in self._load_versions if key not in self._config] + if missing_keys: + raise DatasetNotFoundError( + f"'load_versions' keys [{', '.join(sorted(missing_keys))}] " + f"are not found in the catalog." + ) + + def load(self, name: str, version: str | None = None) -> Any: + """Loads a registered data set.""" + load_version = Version(version, None) if version else None + dataset = self.get_dataset(name, version=load_version) + + self._logger.info( + "Loading data from %s (%s)...", + _format_rich(name, "dark_orange") if self._use_rich_markup else name, + type(dataset).__name__, + extra={"markup": True}, + ) + + return dataset.load() + + def add_from_dict(self, datasets: dict[str, Any], replace: bool = False) -> None: + for ds_name, ds_data in datasets.items(): + dataset = ( + ds_data + if isinstance(ds_data, AbstractDataset) + else MemoryDataset(data=ds_data) + ) # type: ignore[abstract] + self.add(ds_name, dataset, replace) From 68f6527e4c2dc97a0feb48e4a76be21926291b27 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 5 Sep 2024 12:16:06 +0100 Subject: [PATCH 031/173] Removed KedroDataCatalog Signed-off-by: Elena Khaustova --- kedro/io/__init__.py | 2 - kedro/io/data_catalog_redesign.py | 241 ------------------------------ 2 files changed, 243 deletions(-) delete mode 100644 kedro/io/data_catalog_redesign.py diff --git a/kedro/io/__init__.py b/kedro/io/__init__.py index db3c295449..5d17d6f058 100644 --- a/kedro/io/__init__.py +++ b/kedro/io/__init__.py @@ -15,7 +15,6 @@ Version, ) from .data_catalog import DataCatalog -from .data_catalog_redesign import KedroDataCatalog from .lambda_dataset import LambdaDataset from .memory_dataset import MemoryDataset from .shared_memory_dataset import SharedMemoryDataset @@ -29,7 +28,6 @@ "DatasetAlreadyExistsError", "DatasetError", "DatasetNotFoundError", - "KedroDataCatalog", "LambdaDataset", "MemoryDataset", "SharedMemoryDataset", diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py deleted file mode 100644 index 9618198291..0000000000 --- a/kedro/io/data_catalog_redesign.py +++ /dev/null @@ -1,241 +0,0 @@ -from __future__ import annotations - -import copy -import difflib -import logging -import re -from typing import Any - -from kedro.io.core import ( - AbstractDataset, - AbstractVersionedDataset, - DatasetAlreadyExistsError, - DatasetError, - DatasetNotFoundError, - Version, -) -from kedro.io.memory_dataset import MemoryDataset -from kedro.utils import _format_rich, _has_rich_handler - -CREDENTIALS_KEY = "credentials" - - -def validate_dataset_config(ds_name: str, ds_config: Any) -> None: - if not isinstance(ds_config, dict): - raise DatasetError( - f"Catalog entry '{ds_name}' is not a valid dataset configuration. " - "\nHint: If this catalog entry is intended for variable interpolation, " - "make sure that the key is preceded by an underscore." - ) - - -class KedroDataCatalog: - def __init__( - self, - datasets: dict[str, Any] | None = None, - config: dict[str, dict[str, Any]] | None = None, - load_versions: dict[str, str] | None = None, - save_version: str | None = None, - ) -> None: - self._config = config or {} - self._datasets = datasets or {} - self._load_versions = load_versions or {} - self._save_version = save_version - self._use_rich_markup = _has_rich_handler() - - for ds_name in self._datasets: - # TODO: API to get configuration from dataset - self._config[ds_name] = {} - - for ds_name, ds_config in self._config.items(): - self.init_dataset(ds_name, ds_config) - - self._validate_missing_keys() - - @property - def datasets(self) -> dict[str, Any]: - return copy.deepcopy(self._datasets) - - @datasets.setter - def datasets(self, value: Any): - raise AttributeError( - "Operation not allowed! Please change datasets through configuration." - ) - - @property - def config(self): - return copy.deepcopy(self._config) - - @config.setter - def config(self, value: Any) -> dict[str, dict[str, Any]]: - raise AttributeError( - "Operation not allowed! Please change datasets through configuration." - ) - - def __iter__(self): - yield from self._datasets.values() - - def __getitem__(self, ds_name: str) -> AbstractDataset: - return self.get_dataset(ds_name) - - def __contains__(self, ds_name: str) -> bool: - """Check if an item is in the catalog""" - return ds_name in self._datasets - - def _ipython_key_completions_(self) -> list[str]: - return list(self._datasets.keys()) - - def init_dataset(self, ds_name: str, ds_config: dict[str, Any]) -> None: - # Add lazy loading feature to store the configuration but not to init actual dataset - # Initialise actual dataset when load or save - # Add is_init property - validate_dataset_config(ds_name, ds_config) - if ds_name in self._datasets: - raise DatasetAlreadyExistsError( - f"Dataset '{ds_name}' has already been registered" - ) - self._config[ds_name] = ds_config - self._datasets[ds_name] = AbstractDataset.from_config( - ds_name, - ds_config, - self._load_versions.get(ds_name), - self._save_version, - ) - - def get_dataset( - self, ds_name: str, suggest: bool = True, version: Version | None = None - ) -> AbstractDataset: - dataset = self._datasets.get(ds_name, None) - - if dataset is None: - error_msg = f"Dataset '{ds_name}' not found in the catalog" - # Flag to turn on/off fuzzy-matching which can be time consuming and - # slow down plugins like `kedro-viz` - if suggest: - matches = difflib.get_close_matches(ds_name, self._datasets.keys()) - if matches: - suggestions = ", ".join(matches) - error_msg += f" - did you mean one of these instead: {suggestions}" - raise DatasetNotFoundError(error_msg) - - if version and isinstance(dataset, AbstractVersionedDataset): - # we only want to return a similar-looking dataset, - # not modify the one stored in the current catalog - dataset = dataset._copy(_version=version) - - return dataset - - def add( - self, ds_name: str, dataset: AbstractDataset, replace: bool = False - ) -> None: - """Adds a new ``AbstractDataset`` object to the ``KedroDataCatalog``.""" - if ds_name in self._datasets: - if replace: - self._logger.warning("Replacing dataset '%s'", ds_name) - else: - raise DatasetAlreadyExistsError( - f"Dataset '{ds_name}' has already been registered" - ) - self._datasets[ds_name] = dataset - # TODO: API to get configuration from dataset - self._config[ds_name] = {} - - @property - def _logger(self) -> logging.Logger: - return logging.getLogger(__name__) - - def list(self, regex_search: str | None = None) -> list[str]: - """ - List of all dataset names registered in the catalog. - This can be filtered by providing an optional regular expression - which will only return matching keys. - """ - - if regex_search is None: - return list(self._datasets.keys()) - - if not regex_search.strip(): - self._logger.warning("The empty string will not match any data sets") - return [] - - try: - pattern = re.compile(regex_search, flags=re.IGNORECASE) - except re.error as exc: - raise SyntaxError( - f"Invalid regular expression provided: '{regex_search}'" - ) from exc - return [ds_name for ds_name in self._datasets if pattern.search(ds_name)] - - def save(self, name: str, data: Any) -> None: - """Save data to a registered data set.""" - dataset = self.get_dataset(name) - - self._logger.info( - "Saving data to %s (%s)...", - _format_rich(name, "dark_orange") if self._use_rich_markup else name, - type(dataset).__name__, - extra={"markup": True}, - ) - - dataset.save(data) - - def release(self, name: str) -> None: - """Release any cached data associated with a data set - - Args: - name: A data set to be checked. - - Raises: - DatasetNotFoundError: When a data set with the given name - has not yet been registered. - """ - dataset = self.get_dataset(name) - dataset.release() - - def confirm(self, name: str) -> None: - """Confirm a dataset by its name. - - Args: - name: Name of the dataset. - Raises: - DatasetError: When the dataset does not have `confirm` method. - - """ - self._logger.info("Confirming dataset '%s'", name) - dataset = self.get_dataset(name) - - if hasattr(dataset, "confirm"): - dataset.confirm() - else: - raise DatasetError(f"Dataset '{name}' does not have 'confirm' method") - - def _validate_missing_keys(self) -> None: - missing_keys = [key for key in self._load_versions if key not in self._config] - if missing_keys: - raise DatasetNotFoundError( - f"'load_versions' keys [{', '.join(sorted(missing_keys))}] " - f"are not found in the catalog." - ) - - def load(self, name: str, version: str | None = None) -> Any: - """Loads a registered data set.""" - load_version = Version(version, None) if version else None - dataset = self.get_dataset(name, version=load_version) - - self._logger.info( - "Loading data from %s (%s)...", - _format_rich(name, "dark_orange") if self._use_rich_markup else name, - type(dataset).__name__, - extra={"markup": True}, - ) - - return dataset.load() - - def add_from_dict(self, datasets: dict[str, Any], replace: bool = False) -> None: - for ds_name, ds_data in datasets.items(): - dataset = ( - ds_data - if isinstance(ds_data, AbstractDataset) - else MemoryDataset(data=ds_data) - ) # type: ignore[abstract] - self.add(ds_name, dataset, replace) From 2ac4a2f211ec52c77430bb770d22fa43c345b9c9 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 5 Sep 2024 15:21:48 +0100 Subject: [PATCH 032/173] Updated from_config method Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 16 +++++-- kedro/io/data_catalog.py | 69 +++++++++++++---------------- 2 files changed, 43 insertions(+), 42 deletions(-) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index 1c7a39cfa9..aaa102eddd 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -92,7 +92,7 @@ class DataCatalogConfigResolver: def __init__( self, - config: dict[str, dict[str, Any]], + config: dict[str, dict[str, Any]] | None = None, credentials: dict[str, dict[str, Any]] | None = None, ): self._runtime_patterns: Patterns = {} @@ -103,14 +103,22 @@ def __init__( @property def config(self) -> dict[str, dict[str, Any]]: - return copy.deepcopy(self._resolved_configs) + return self._resolved_configs + + @property + def dataset_patterns(self) -> Patterns: + return self._dataset_patterns + + @property + def default_pattern(self) -> Patterns: + return self._default_pattern @property def _logger(self) -> logging.Logger: return logging.getLogger(__name__) @staticmethod - def _is_pattern(pattern: str) -> bool: + def is_pattern(pattern: str) -> bool: """Check if a given string is a pattern. Assume that any name with '{' is a pattern.""" return "{" in pattern @@ -189,7 +197,7 @@ def _extract_patterns( def _init_configs( self, - config: dict[str, dict[str, Any]], + config: dict[str, dict[str, Any]] | None, credentials: dict[str, dict[str, Any]] | None, ) -> dict[str, dict[str, Any]]: """Initialize the dataset configuration with resolved credentials.""" diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index d3fd163230..6be3d2905e 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -16,6 +16,7 @@ from parse import parse +from kedro.io import DataCatalogConfigResolver from kedro.io.core import ( AbstractDataset, AbstractVersionedDataset, @@ -162,9 +163,10 @@ def __init__( # noqa: PLR0913 datasets: dict[str, AbstractDataset] | None = None, feed_dict: dict[str, Any] | None = None, dataset_patterns: Patterns | None = None, + default_pattern: Patterns | None = None, load_versions: dict[str, str] | None = None, save_version: str | None = None, - default_pattern: Patterns | None = None, + config_resolver: DataCatalogConfigResolver = None, ) -> None: """``DataCatalog`` stores instances of ``AbstractDataset`` implementations to provide ``load`` and ``save`` capabilities from @@ -195,6 +197,8 @@ def __init__( # noqa: PLR0913 sorted in lexicographical order. default_pattern: A dictionary of the default catch-all pattern that overrides the default pattern provided through the runners. + config_resolver: + Example: :: @@ -206,14 +210,12 @@ def __init__( # noqa: PLR0913 >>> save_args={"index": False}) >>> catalog = DataCatalog(datasets={'cars': cars}) """ + self._config_resolver = config_resolver or DataCatalogConfigResolver() self._datasets = dict(datasets or {}) + self._datasets_config = self._config_resolver.config self.datasets = _FrozenDatasets(self._datasets) - # Keep a record of all patterns in the catalog. - # {dataset pattern name : dataset pattern body} - self._dataset_patterns = dataset_patterns or {} self._load_versions = load_versions or {} self._save_version = save_version - self._default_pattern = default_pattern or {} self._use_rich_markup = _has_rich_handler() if feed_dict: @@ -304,43 +306,26 @@ class to be loaded is specified with the key ``type`` and their >>> catalog.save("boats", df) """ datasets = {} - dataset_patterns = {} - catalog = copy.deepcopy(catalog) or {} - credentials = copy.deepcopy(credentials) or {} + config_resolver = DataCatalogConfigResolver(catalog, credentials) save_version = save_version or generate_timestamp() load_versions = copy.deepcopy(load_versions) or {} - user_default = {} - - for ds_name, ds_config in catalog.items(): - if not isinstance(ds_config, dict): - raise DatasetError( - f"Catalog entry '{ds_name}' is not a valid dataset configuration. " - "\nHint: If this catalog entry is intended for variable interpolation, " - "make sure that the key is preceded by an underscore." - ) - ds_config = _resolve_credentials( # noqa: PLW2901 - ds_config, credentials - ) - if cls._is_pattern(ds_name): - # Add each factory to the dataset_patterns dict. - dataset_patterns[ds_name] = ds_config - - else: + for ds_name in catalog: + if not config_resolver.is_pattern(ds_name): datasets[ds_name] = AbstractDataset.from_config( - ds_name, ds_config, load_versions.get(ds_name), save_version + ds_name, + config_resolver.config[ds_name], + load_versions.get(ds_name), + save_version, ) - sorted_patterns = cls._sort_patterns(dataset_patterns) - if sorted_patterns: - # If the last pattern is a catch-all pattern, pop it and set it as the default - if cls._specificity(list(sorted_patterns.keys())[-1]) == 0: - last_pattern = sorted_patterns.popitem() - user_default = {last_pattern[0]: last_pattern[1]} missing_keys = [ - key - for key in load_versions.keys() - if not (key in catalog or cls._match_pattern(sorted_patterns, key)) + ds_name + for ds_name in load_versions + if not ( + ds_name in config_resolver.config + or config_resolver.match_pattern(ds_name) + ) ] if missing_keys: raise DatasetNotFoundError( @@ -350,10 +335,11 @@ class to be loaded is specified with the key ``type`` and their return cls( datasets=datasets, - dataset_patterns=sorted_patterns, + dataset_patterns=config_resolver.dataset_patterns, + default_pattern=config_resolver.default_pattern, load_versions=load_versions, save_version=save_version, - default_pattern=user_default, + config_resolver=config_resolver, ) @staticmethod @@ -619,7 +605,11 @@ def release(self, name: str) -> None: dataset.release() def add( - self, dataset_name: str, dataset: AbstractDataset, replace: bool = False + self, + dataset_name: str, + dataset: AbstractDataset, + dataset_config: dict[str, Any] | None = None, + replace: bool = False, ) -> None: """Adds a new ``AbstractDataset`` object to the ``DataCatalog``. @@ -628,6 +618,7 @@ def add( registered yet. dataset: A data set object to be associated with the given data set name. + dataset_config: A dictionary with dataset configuration. replace: Specifies whether to replace an existing dataset with the same name is allowed. @@ -654,6 +645,8 @@ def add( f"Dataset '{dataset_name}' has already been registered" ) self._datasets[dataset_name] = dataset + if dataset_config is not None: + self._datasets_config[dataset_name] = dataset_config self.datasets = _FrozenDatasets(self.datasets, {dataset_name: dataset}) def add_all( From cb5879d5067e6a9020ae0b3057a550ab5784d430 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 5 Sep 2024 15:48:59 +0100 Subject: [PATCH 033/173] Updated constructor and add methods Signed-off-by: Elena Khaustova --- kedro/io/data_catalog.py | 44 ++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index 6be3d2905e..a78a7f2177 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -197,7 +197,7 @@ def __init__( # noqa: PLR0913 sorted in lexicographical order. default_pattern: A dictionary of the default catch-all pattern that overrides the default pattern provided through the runners. - config_resolver: + config_resolver: An instance of DataCatalogConfigResolver to resolve dataset patterns and configurations. Example: @@ -211,11 +211,15 @@ def __init__( # noqa: PLR0913 >>> catalog = DataCatalog(datasets={'cars': cars}) """ self._config_resolver = config_resolver or DataCatalogConfigResolver() - self._datasets = dict(datasets or {}) self._datasets_config = self._config_resolver.config - self.datasets = _FrozenDatasets(self._datasets) + self._datasets = {} + self.datasets = {} + + self.add_all(dict(datasets) or {}, datasets_configs=self._datasets_config) + self._load_versions = load_versions or {} self._save_version = save_version + self._use_rich_markup = _has_rich_handler() if feed_dict: @@ -608,8 +612,8 @@ def add( self, dataset_name: str, dataset: AbstractDataset, - dataset_config: dict[str, Any] | None = None, replace: bool = False, + dataset_config: dict[str, Any] | None = None, ) -> None: """Adds a new ``AbstractDataset`` object to the ``DataCatalog``. @@ -618,9 +622,10 @@ def add( registered yet. dataset: A data set object to be associated with the given data set name. - dataset_config: A dictionary with dataset configuration. replace: Specifies whether to replace an existing dataset with the same name is allowed. + dataset_config: A dictionary with dataset configuration. + Raises: DatasetAlreadyExistsError: When a data set with the same name @@ -645,12 +650,16 @@ def add( f"Dataset '{dataset_name}' has already been registered" ) self._datasets[dataset_name] = dataset - if dataset_config is not None: - self._datasets_config[dataset_name] = dataset_config + self._datasets_config[dataset_name] = ( + dataset_config if dataset_config is not None else {} + ) self.datasets = _FrozenDatasets(self.datasets, {dataset_name: dataset}) def add_all( - self, datasets: dict[str, AbstractDataset], replace: bool = False + self, + datasets: dict[str, AbstractDataset], + replace: bool = False, + datasets_configs: dict[str, dict[str, Any]] | None = None, ) -> None: """Adds a group of new data sets to the ``DataCatalog``. @@ -659,6 +668,7 @@ def add_all( instances. replace: Specifies whether to replace an existing dataset with the same name is allowed. + datasets_configs: A dictionary of dataset configurations. Raises: DatasetAlreadyExistsError: When a data set with the same name @@ -681,8 +691,8 @@ def add_all( >>> >>> assert catalog.list() == ["cars", "planes", "boats"] """ - for name, dataset in datasets.items(): - self.add(name, dataset, replace) + for ds_name, ds in datasets.items(): + self.add(ds_name, ds, replace, datasets_configs.get(ds_name, {})) def add_feed_dict(self, feed_dict: dict[str, Any], replace: bool = False) -> None: """Add datasets to the ``DataCatalog`` using the data provided through the `feed_dict`. @@ -719,13 +729,13 @@ def add_feed_dict(self, feed_dict: dict[str, Any], replace: bool = False) -> Non >>> >>> assert catalog.load("data_csv_dataset").equals(df) """ - for dataset_name in feed_dict: - if isinstance(feed_dict[dataset_name], AbstractDataset): - dataset = feed_dict[dataset_name] - else: - dataset = MemoryDataset(data=feed_dict[dataset_name]) # type: ignore[abstract] - - self.add(dataset_name, dataset, replace) + for ds_name, ds_data in feed_dict.items(): + dataset = ( + ds_data + if isinstance(ds_data, AbstractDataset) + else MemoryDataset(data=ds_data) + ) # type: ignore[abstract] + self.add(ds_name, dataset, replace) def list(self, regex_search: str | None = None) -> list[str]: """ From 9038e963281333ebe10316d297bce0ef723388fd Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 5 Sep 2024 16:14:13 +0100 Subject: [PATCH 034/173] Updated _get_dataset method Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 6 +- kedro/io/data_catalog.py | 174 ++-------------------------- 2 files changed, 11 insertions(+), 169 deletions(-) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index aaa102eddd..846ef6277f 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -182,7 +182,7 @@ def _extract_patterns( user_default = {} for ds_name, ds_config in config.items(): - if cls._is_pattern(ds_name): + if cls.is_pattern(ds_name): resolved_config = _resolve_credentials(ds_config, credentials) dataset_patterns[ds_name] = resolved_config @@ -206,12 +206,12 @@ def _init_configs( resolved_configs = {} for ds_name, ds_config in config.items(): - if not self._is_pattern(ds_name): + if not self.is_pattern(ds_name): resolved_configs[ds_name] = _resolve_credentials(ds_config, credentials) return resolved_configs - def resolve_dataset_patterns( + def resolve_dataset_pattern( self, datasets: str | list[str] ) -> dict[str, Any] | list[dict[str, Any]]: """Resolve dataset patterns and return resolved configurations based on the existing patterns.""" diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index a78a7f2177..56e046da68 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -14,8 +14,6 @@ import re from typing import Any, Dict -from parse import parse - from kedro.io import DataCatalogConfigResolver from kedro.io.core import ( AbstractDataset, @@ -36,57 +34,6 @@ WORDS_REGEX_PATTERN = re.compile(r"\W+") -def _get_credentials(credentials_name: str, credentials: dict[str, Any]) -> Any: - """Return a set of credentials from the provided credentials dict. - - Args: - credentials_name: Credentials name. - credentials: A dictionary with all credentials. - - Returns: - The set of requested credentials. - - Raises: - KeyError: When a data set with the given name has not yet been - registered. - - """ - try: - return credentials[credentials_name] - except KeyError as exc: - raise KeyError( - f"Unable to find credentials '{credentials_name}': check your data " - "catalog and credentials configuration. See " - "https://docs.kedro.org/en/stable/api/kedro.io.DataCatalog.html " - "for an example." - ) from exc - - -def _resolve_credentials( - config: dict[str, Any], credentials: dict[str, Any] -) -> dict[str, Any]: - """Return the dataset configuration where credentials are resolved using - credentials dictionary provided. - - Args: - config: Original dataset config, which may contain unresolved credentials. - credentials: A dictionary with all credentials. - - Returns: - The dataset config, where all the credentials are successfully resolved. - """ - config = copy.deepcopy(config) - - def _map_value(key: str, value: Any) -> Any: - if key == CREDENTIALS_KEY and isinstance(value, str): - return _get_credentials(value, credentials) - if isinstance(value, dict): - return {k: _map_value(k, v) for k, v in value.items()} - return value - - return {k: _map_value(k, v) for k, v in config.items()} - - def _sub_nonword_chars(dataset_name: str) -> str: """Replace non-word characters in data set names since Kedro 0.16.2. @@ -346,101 +293,22 @@ class to be loaded is specified with the key ``type`` and their config_resolver=config_resolver, ) - @staticmethod - def _is_pattern(pattern: str) -> bool: - """Check if a given string is a pattern. Assume that any name with '{' is a pattern.""" - return "{" in pattern - - @staticmethod - def _match_pattern(dataset_patterns: Patterns, dataset_name: str) -> str | None: - """Match a dataset name against patterns in a dictionary.""" - matches = ( - pattern - for pattern in dataset_patterns.keys() - if parse(pattern, dataset_name) - ) - return next(matches, None) - - @classmethod - def _sort_patterns(cls, dataset_patterns: Patterns) -> dict[str, dict[str, Any]]: - """Sort a dictionary of dataset patterns according to parsing rules. - - In order: - - 1. Decreasing specificity (number of characters outside the curly brackets) - 2. Decreasing number of placeholders (number of curly bracket pairs) - 3. Alphabetically - """ - sorted_keys = sorted( - dataset_patterns, - key=lambda pattern: ( - -(cls._specificity(pattern)), - -pattern.count("{"), - pattern, - ), - ) - catch_all = [ - pattern for pattern in sorted_keys if cls._specificity(pattern) == 0 - ] - if len(catch_all) > 1: - raise DatasetError( - f"Multiple catch-all patterns found in the catalog: {', '.join(catch_all)}. Only one catch-all pattern is allowed, remove the extras." - ) - return {key: dataset_patterns[key] for key in sorted_keys} - - @staticmethod - def _specificity(pattern: str) -> int: - """Helper function to check the length of exactly matched characters not inside brackets. - - Example: - :: - - >>> specificity("{namespace}.companies") = 10 - >>> specificity("{namespace}.{dataset}") = 1 - >>> specificity("france.companies") = 16 - """ - # Remove all the placeholders from the pattern and count the number of remaining chars - result = re.sub(r"\{.*?\}", "", pattern) - return len(result) - def _get_dataset( self, dataset_name: str, version: Version | None = None, suggest: bool = True, ) -> AbstractDataset: - matched_pattern = self._match_pattern( - self._dataset_patterns, dataset_name - ) or self._match_pattern(self._default_pattern, dataset_name) - if dataset_name not in self._datasets and matched_pattern: - # If the dataset is a patterned dataset, materialise it and add it to - # the catalog - config_copy = copy.deepcopy( - self._dataset_patterns.get(matched_pattern) - or self._default_pattern.get(matched_pattern) - or {} - ) - dataset_config = self._resolve_config( - dataset_name, matched_pattern, config_copy - ) - dataset = AbstractDataset.from_config( + ds_config = self._config_resolver.resolve_dataset_pattern(dataset_name) + + if ds_config is not None: + ds = AbstractDataset.from_config( dataset_name, - dataset_config, + ds_config, self._load_versions.get(dataset_name), self._save_version, ) - if ( - self._specificity(matched_pattern) == 0 - and matched_pattern in self._default_pattern - ): - self._logger.warning( - "Config from the dataset factory pattern '%s' in the catalog will be used to " - "override the default dataset creation for '%s'", - matched_pattern, - dataset_name, - ) - - self.add(dataset_name, dataset) + self.add(dataset_name, ds, dataset_config=ds_config) if dataset_name not in self._datasets: error_msg = f"Dataset '{dataset_name}' not found in the catalog" @@ -452,7 +320,9 @@ def _get_dataset( suggestions = ", ".join(matches) error_msg += f" - did you mean one of these instead: {suggestions}" raise DatasetNotFoundError(error_msg) + dataset = self._datasets[dataset_name] + if version and isinstance(dataset, AbstractVersionedDataset): # we only want to return a similar-looking dataset, # not modify the one stored in the current catalog @@ -467,34 +337,6 @@ def __contains__(self, dataset_name: str) -> bool: return True return False - @classmethod - def _resolve_config( - cls, - dataset_name: str, - matched_pattern: str, - config: dict, - ) -> dict[str, Any]: - """Get resolved AbstractDataset from a factory config""" - result = parse(matched_pattern, dataset_name) - # Resolve the factory config for the dataset - if isinstance(config, dict): - for key, value in config.items(): - config[key] = cls._resolve_config(dataset_name, matched_pattern, value) - elif isinstance(config, (list, tuple)): - config = [ - cls._resolve_config(dataset_name, matched_pattern, value) - for value in config - ] - elif isinstance(config, str) and "}" in config: - try: - config = str(config).format_map(result.named) - except KeyError as exc: - raise DatasetError( - f"Unable to resolve '{config}' from the pattern '{matched_pattern}'. Keys used in the configuration " - f"should be present in the dataset factory pattern." - ) from exc - return config - def load(self, name: str, version: str | None = None) -> Any: """Loads a registered data set. From cc89565db750273b44be00e2d1b796859c017eaf Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 5 Sep 2024 16:18:58 +0100 Subject: [PATCH 035/173] Updated __contains__ Signed-off-by: Elena Khaustova --- kedro/io/data_catalog.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index 56e046da68..2986b0b6a1 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -332,10 +332,9 @@ def _get_dataset( def __contains__(self, dataset_name: str) -> bool: """Check if an item is in the catalog as a materialised dataset or pattern""" - matched_pattern = self._match_pattern(self._dataset_patterns, dataset_name) - if dataset_name in self._datasets or matched_pattern: - return True - return False + return dataset_name in self._datasets or self._config_resolver.match_pattern( + dataset_name + ) def load(self, name: str, version: str | None = None) -> Any: """Loads a registered data set. From 59b676468a36b69079af6f500a0b14b197b71065 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 5 Sep 2024 16:31:07 +0100 Subject: [PATCH 036/173] Updated __eq__ and shallow_copy Signed-off-by: Elena Khaustova --- kedro/io/data_catalog.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index 2986b0b6a1..5d268e282e 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -631,26 +631,21 @@ def shallow_copy( Returns: Copy of the current object. """ - if not self._default_pattern and extra_dataset_patterns: - unsorted_dataset_patterns = { - **self._dataset_patterns, - **extra_dataset_patterns, - } - dataset_patterns = self._sort_patterns(unsorted_dataset_patterns) - else: - dataset_patterns = self._dataset_patterns + if extra_dataset_patterns: + self._config_resolver.add_runtime_patterns(extra_dataset_patterns) return self.__class__( datasets=self._datasets, - dataset_patterns=dataset_patterns, + dataset_patterns=self._config_resolver.dataset_patterns, + default_pattern=self._config_resolver.default_pattern, load_versions=self._load_versions, save_version=self._save_version, - default_pattern=self._default_pattern, + config_resolver=self._config_resolver, ) def __eq__(self, other) -> bool: # type: ignore[no-untyped-def] - return (self._datasets, self._dataset_patterns) == ( + return (self._datasets, self._config_resolver.dataset_patterns) == ( other._datasets, - other._dataset_patterns, + other._config_resolver._dataset_patterns, ) def confirm(self, name: str) -> None: From 4f5a3fbb8d1f8f1da8c4000af80d77245c948a8c Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 5 Sep 2024 16:36:50 +0100 Subject: [PATCH 037/173] Added __iter__ and __getitem__ Signed-off-by: Elena Khaustova --- kedro/io/data_catalog.py | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index 5d268e282e..883d5da78e 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -175,6 +175,24 @@ def __init__( # noqa: PLR0913 def __repr__(self) -> str: return self.datasets.__repr__() + def __iter__(self): + yield from self._datasets.values() + + def __getitem__(self, ds_name: str) -> AbstractDataset: + return self._get_dataset(ds_name) + + def __contains__(self, dataset_name: str) -> bool: + """Check if an item is in the catalog as a materialised dataset or pattern""" + return dataset_name in self._datasets or self._config_resolver.match_pattern( + dataset_name + ) + + def __eq__(self, other) -> bool: # type: ignore[no-untyped-def] + return (self._datasets, self._config_resolver.dataset_patterns) == ( + other._datasets, + other._config_resolver._dataset_patterns, + ) + @property def _logger(self) -> logging.Logger: return logging.getLogger(__name__) @@ -330,12 +348,6 @@ def _get_dataset( return dataset - def __contains__(self, dataset_name: str) -> bool: - """Check if an item is in the catalog as a materialised dataset or pattern""" - return dataset_name in self._datasets or self._config_resolver.match_pattern( - dataset_name - ) - def load(self, name: str, version: str | None = None) -> Any: """Loads a registered data set. @@ -621,7 +633,7 @@ def list(self, regex_search: str | None = None) -> list[str]: raise SyntaxError( f"Invalid regular expression provided: '{regex_search}'" ) from exc - return [dset_name for dset_name in self._datasets if pattern.search(dset_name)] + return [ds_name for ds_name in self._datasets if pattern.search(ds_name)] def shallow_copy( self, extra_dataset_patterns: Patterns | None = None @@ -642,12 +654,6 @@ def shallow_copy( config_resolver=self._config_resolver, ) - def __eq__(self, other) -> bool: # type: ignore[no-untyped-def] - return (self._datasets, self._config_resolver.dataset_patterns) == ( - other._datasets, - other._config_resolver._dataset_patterns, - ) - def confirm(self, name: str) -> None: """Confirm a dataset by its name. From 12ed6f2e39a2e3737f905323be8cb2f7c87daef6 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 5 Sep 2024 16:39:24 +0100 Subject: [PATCH 038/173] Removed unused imports Signed-off-by: Elena Khaustova --- kedro/io/data_catalog.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index 883d5da78e..7218ccaff5 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -12,9 +12,9 @@ import logging import pprint import re -from typing import Any, Dict +from typing import Any -from kedro.io import DataCatalogConfigResolver +from kedro.io.catalog_config_resolver import DataCatalogConfigResolver, Patterns from kedro.io.core import ( AbstractDataset, AbstractVersionedDataset, @@ -27,10 +27,6 @@ from kedro.io.memory_dataset import MemoryDataset from kedro.utils import _format_rich, _has_rich_handler -Patterns = Dict[str, Dict[str, Any]] - -CATALOG_KEY = "catalog" -CREDENTIALS_KEY = "credentials" WORDS_REGEX_PATTERN = re.compile(r"\W+") From a106cec92eda2ca74940d92b4ae872a86797f883 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 5 Sep 2024 16:47:38 +0100 Subject: [PATCH 039/173] Added TODO Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index 846ef6277f..1236fd397b 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -66,7 +66,7 @@ def _resolve_value(key: str, value: Any) -> Any: def _resolve_dataset_config( ds_name: str, pattern: str, - config: dict, + config: Any, ) -> dict[str, Any]: """Resolve dataset configuration based on the provided pattern.""" resolved_vars = parse(pattern, ds_name) @@ -201,6 +201,7 @@ def _init_configs( credentials: dict[str, dict[str, Any]] | None, ) -> dict[str, dict[str, Any]]: """Initialize the dataset configuration with resolved credentials.""" + # TODO: check if deep copies are required config = copy.deepcopy(config) or {} credentials = copy.deepcopy(credentials) or {} resolved_configs = {} From 6df04f77ed69efcd5e1b3f3718dc4ce0aa1c92b6 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 5 Sep 2024 16:52:37 +0100 Subject: [PATCH 040/173] Updated runner.run() Signed-off-by: Elena Khaustova --- kedro/runner/runner.py | 1 - 1 file changed, 1 deletion(-) diff --git a/kedro/runner/runner.py b/kedro/runner/runner.py index 2ffd0389e4..6f165e87c0 100644 --- a/kedro/runner/runner.py +++ b/kedro/runner/runner.py @@ -83,7 +83,6 @@ def run( """ hook_or_null_manager = hook_manager or _NullPluginManager() - catalog = catalog.shallow_copy() # Check which datasets used in the pipeline are in the catalog or match # a pattern in the catalog From 8566e279b28fc7ae966ec0584eedc6e096098b7b Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 5 Sep 2024 16:59:00 +0100 Subject: [PATCH 041/173] Updated session Signed-off-by: Elena Khaustova --- kedro/framework/session/session.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/kedro/framework/session/session.py b/kedro/framework/session/session.py index 91928f7c4b..25a0f46896 100644 --- a/kedro/framework/session/session.py +++ b/kedro/framework/session/session.py @@ -397,10 +397,7 @@ def run( # noqa: PLR0913 try: if isinstance(runner, ThreadRunner): for ds in filtered_pipeline.datasets(): - if catalog._match_pattern( - catalog._dataset_patterns, ds - ) or catalog._match_pattern(catalog._default_pattern, ds): - _ = catalog._get_dataset(ds) + _ = catalog._get_dataset(ds) run_result = runner.run( filtered_pipeline, catalog, hook_manager, session_id ) From 2dcea33ad36c86e83072b11d0706d962d9406e8c Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 5 Sep 2024 18:20:10 +0100 Subject: [PATCH 042/173] Added confil_resolver property Signed-off-by: Elena Khaustova --- kedro/framework/session/session.py | 3 ++- kedro/io/data_catalog.py | 6 +++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/kedro/framework/session/session.py b/kedro/framework/session/session.py index 25a0f46896..2b13cd1694 100644 --- a/kedro/framework/session/session.py +++ b/kedro/framework/session/session.py @@ -397,7 +397,8 @@ def run( # noqa: PLR0913 try: if isinstance(runner, ThreadRunner): for ds in filtered_pipeline.datasets(): - _ = catalog._get_dataset(ds) + if catalog.config_resolver.match_pattern(ds): + _ = catalog._get_dataset(ds) run_result = runner.run( filtered_pipeline, catalog, hook_manager, session_id ) diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index 7218ccaff5..abf9bf6353 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -189,6 +189,10 @@ def __eq__(self, other) -> bool: # type: ignore[no-untyped-def] other._config_resolver._dataset_patterns, ) + @property + def config_resolver(self): + return self._config_resolver + @property def _logger(self) -> logging.Logger: return logging.getLogger(__name__) @@ -315,7 +319,7 @@ def _get_dataset( ) -> AbstractDataset: ds_config = self._config_resolver.resolve_dataset_pattern(dataset_name) - if ds_config is not None: + if dataset_name not in self._datasets and ds_config is not None: ds = AbstractDataset.from_config( dataset_name, ds_config, From a46597fec0637f1fd268087bf0d65969bc39b3d7 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 5 Sep 2024 18:31:02 +0100 Subject: [PATCH 043/173] Updated catalog list command Signed-off-by: Elena Khaustova --- kedro/framework/cli/catalog.py | 36 ++++++++++++++++------------------ 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/kedro/framework/cli/catalog.py b/kedro/framework/cli/catalog.py index 223980dade..01170dbd7c 100644 --- a/kedro/framework/cli/catalog.py +++ b/kedro/framework/cli/catalog.py @@ -28,6 +28,11 @@ def _create_session(package_name: str, **kwargs: Any) -> KedroSession: return KedroSession.create(**kwargs) +def is_parameter(dataset_name: str) -> bool: + """Check if dataset is a parameter.""" + return dataset_name.startswith("params:") or dataset_name == "parameters" + + @click.group(name="Kedro") def catalog_cli() -> None: # pragma: no cover pass @@ -88,21 +93,15 @@ def list_datasets(metadata: ProjectMetadata, pipeline: str, env: str) -> None: # resolve any factory datasets in the pipeline factory_ds_by_type = defaultdict(list) - for ds_name in default_ds: - matched_pattern = data_catalog._match_pattern( - data_catalog._dataset_patterns, ds_name - ) or data_catalog._match_pattern(data_catalog._default_pattern, ds_name) - if matched_pattern: - ds_config_copy = copy.deepcopy( - data_catalog._dataset_patterns.get(matched_pattern) - or data_catalog._default_pattern.get(matched_pattern) - or {} - ) - ds_config = data_catalog._resolve_config( - ds_name, matched_pattern, ds_config_copy + resolved_configs = data_catalog.config_resolver.resolve_dataset_pattern( + default_ds + ) + for ds_name, ds_config in zip(default_ds, resolved_configs): + if data_catalog.config_resolver.match_pattern(ds_name): + factory_ds_by_type[ds_config.get("type", "DefaultDataset")].append( + ds_name ) - factory_ds_by_type[ds_config["type"]].append(ds_name) default_ds = default_ds - set(chain.from_iterable(factory_ds_by_type.values())) @@ -128,12 +127,11 @@ def _map_type_to_datasets( datasets of the specific type as a value. """ mapping = defaultdict(list) # type: ignore[var-annotated] - for dataset in datasets: - is_param = dataset.startswith("params:") or dataset == "parameters" - if not is_param: - ds_type = datasets_meta[dataset].__class__.__name__ - if dataset not in mapping[ds_type]: - mapping[ds_type].append(dataset) + for dataset_name in datasets: + if not is_parameter(dataset_name): + ds_type = datasets_meta[dataset_name].__class__.__name__ + if dataset_name not in mapping[ds_type]: + mapping[ds_type].append(dataset_name) return mapping From 3787545f8b6afb2e5d83e9a1ae7d3b7db48337ab Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 5 Sep 2024 18:38:03 +0100 Subject: [PATCH 044/173] Updated catalog create command Signed-off-by: Elena Khaustova --- kedro/framework/cli/catalog.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/kedro/framework/cli/catalog.py b/kedro/framework/cli/catalog.py index 01170dbd7c..cebbdb9b28 100644 --- a/kedro/framework/cli/catalog.py +++ b/kedro/framework/cli/catalog.py @@ -168,20 +168,16 @@ def create_catalog(metadata: ProjectMetadata, pipeline_name: str, env: str) -> N f"'{pipeline_name}' pipeline not found! Existing pipelines: {existing_pipelines}" ) - pipe_datasets = { - ds_name - for ds_name in pipeline.datasets() - if not ds_name.startswith("params:") and ds_name != "parameters" + pipeline_datasets = { + ds_name for ds_name in pipeline.datasets() if not is_parameter(ds_name) } catalog_datasets = { - ds_name - for ds_name in context.catalog._datasets.keys() - if not ds_name.startswith("params:") and ds_name != "parameters" + ds_name for ds_name in context.catalog.list() if not is_parameter(ds_name) } # Datasets that are missing in Data Catalog - missing_ds = sorted(pipe_datasets - catalog_datasets) + missing_ds = sorted(pipeline_datasets - catalog_datasets) if missing_ds: catalog_path = ( context.project_path From 68d612d34c2276d3d8d00a24207ffeb38d34da99 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 5 Sep 2024 18:41:04 +0100 Subject: [PATCH 045/173] Updated catalog rank command Signed-off-by: Elena Khaustova --- kedro/framework/cli/catalog.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kedro/framework/cli/catalog.py b/kedro/framework/cli/catalog.py index cebbdb9b28..c4b701f56e 100644 --- a/kedro/framework/cli/catalog.py +++ b/kedro/framework/cli/catalog.py @@ -215,12 +215,14 @@ def rank_catalog_factories(metadata: ProjectMetadata, env: str) -> None: session = _create_session(metadata.package_name, env=env) context = session.load_context() - catalog_factories = { - **context.catalog._dataset_patterns, - **context.catalog._default_pattern, - } + catalog_factories = list( + { + **context.catalog.config_resolver.dataset_patterns, + **context.catalog.config_resolver.default_pattern, + }.keys() + ) if catalog_factories: - click.echo(yaml.dump(list(catalog_factories.keys()))) + click.echo(yaml.dump(catalog_factories)) else: click.echo("There are no dataset factories in the catalog.") From af5bee9df3e10f79a7ee797d4746b7dbd02ed415 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 5 Sep 2024 18:54:29 +0100 Subject: [PATCH 046/173] Updated catalog resolve command Signed-off-by: Elena Khaustova --- kedro/framework/cli/catalog.py | 29 +++++++++-------------------- kedro/io/data_catalog.py | 6 +++++- 2 files changed, 14 insertions(+), 21 deletions(-) diff --git a/kedro/framework/cli/catalog.py b/kedro/framework/cli/catalog.py index c4b701f56e..297e0dcb9e 100644 --- a/kedro/framework/cli/catalog.py +++ b/kedro/framework/cli/catalog.py @@ -2,7 +2,6 @@ from __future__ import annotations -import copy from collections import defaultdict from itertools import chain from typing import TYPE_CHECKING, Any @@ -245,36 +244,26 @@ def resolve_patterns(metadata: ProjectMetadata, env: str) -> None: explicit_datasets = { ds_name: ds_config - for ds_name, ds_config in catalog_config.items() - if not data_catalog._is_pattern(ds_name) + for ds_name, ds_config in data_catalog.datasets_config.items() + if not is_parameter(ds_name) } target_pipelines = pipelines.keys() - datasets = set() + pipeline_datasets = set() for pipe in target_pipelines: pl_obj = pipelines.get(pipe) if pl_obj: - datasets.update(pl_obj.datasets()) + pipeline_datasets.update(pl_obj.datasets()) - for ds_name in datasets: - is_param = ds_name.startswith("params:") or ds_name == "parameters" - if ds_name in explicit_datasets or is_param: + for ds_name in pipeline_datasets: + if ds_name in explicit_datasets or is_parameter(ds_name): continue - matched_pattern = data_catalog._match_pattern( - data_catalog._dataset_patterns, ds_name - ) or data_catalog._match_pattern(data_catalog._default_pattern, ds_name) - if matched_pattern: - ds_config_copy = copy.deepcopy( - data_catalog._dataset_patterns.get(matched_pattern) - or data_catalog._default_pattern.get(matched_pattern) - or {} - ) + ds_config = data_catalog.config_resolver.resolve_dataset_pattern(ds_name) - ds_config = data_catalog._resolve_config( - ds_name, matched_pattern, ds_config_copy - ) + # Exclude MemoryDatasets not set in the catalog explicitly + if ds_config is not None: explicit_datasets[ds_name] = ds_config secho(yaml.dump(explicit_datasets)) diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index abf9bf6353..75cd65bf67 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -190,9 +190,13 @@ def __eq__(self, other) -> bool: # type: ignore[no-untyped-def] ) @property - def config_resolver(self): + def config_resolver(self) -> DataCatalogConfigResolver: return self._config_resolver + @property + def datasets_config(self) -> dict[str, dict[str, Any]]: + return self._datasets_config + @property def _logger(self) -> logging.Logger: return logging.getLogger(__name__) From e67ff0f7a16f22006631f80a54243dde69d25388 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 5 Sep 2024 19:02:36 +0100 Subject: [PATCH 047/173] Remove some methods Signed-off-by: Elena Khaustova --- kedro/io/data_catalog.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index 75cd65bf67..9b7ff7404e 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -171,12 +171,6 @@ def __init__( # noqa: PLR0913 def __repr__(self) -> str: return self.datasets.__repr__() - def __iter__(self): - yield from self._datasets.values() - - def __getitem__(self, ds_name: str) -> AbstractDataset: - return self._get_dataset(ds_name) - def __contains__(self, dataset_name: str) -> bool: """Check if an item is in the catalog as a materialised dataset or pattern""" return dataset_name in self._datasets or self._config_resolver.match_pattern( From 7b3afa21862089c4856a1b9a73ae726499eae091 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 6 Sep 2024 11:40:22 +0100 Subject: [PATCH 048/173] Removed ds configs from catalog Signed-off-by: Elena Khaustova --- kedro/framework/cli/catalog.py | 4 ++-- kedro/io/data_catalog.py | 27 +++++++-------------------- 2 files changed, 9 insertions(+), 22 deletions(-) diff --git a/kedro/framework/cli/catalog.py b/kedro/framework/cli/catalog.py index 297e0dcb9e..900d34d7e2 100644 --- a/kedro/framework/cli/catalog.py +++ b/kedro/framework/cli/catalog.py @@ -244,8 +244,8 @@ def resolve_patterns(metadata: ProjectMetadata, env: str) -> None: explicit_datasets = { ds_name: ds_config - for ds_name, ds_config in data_catalog.datasets_config.items() - if not is_parameter(ds_name) + for ds_name, ds_config in catalog_config.items() + if not data_catalog.config_resolver.is_pattern(ds_name) } target_pipelines = pipelines.keys() diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index 9b7ff7404e..e56610b5c2 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -105,8 +105,8 @@ def __init__( # noqa: PLR0913 self, datasets: dict[str, AbstractDataset] | None = None, feed_dict: dict[str, Any] | None = None, - dataset_patterns: Patterns | None = None, - default_pattern: Patterns | None = None, + dataset_patterns: Patterns | None = None, # Kept for interface compatibility + default_pattern: Patterns | None = None, # Kept for interface compatibility load_versions: dict[str, str] | None = None, save_version: str | None = None, config_resolver: DataCatalogConfigResolver = None, @@ -115,7 +115,7 @@ def __init__( # noqa: PLR0913 implementations to provide ``load`` and ``save`` capabilities from anywhere in the program. To use a ``DataCatalog``, you need to instantiate it with a dictionary of data sets. Then it will act as a - single point of reference for your calls, relaying load and save + single point of reference for your calls, relaying load and save§ functions to the underlying data sets. Args: @@ -154,11 +154,10 @@ def __init__( # noqa: PLR0913 >>> catalog = DataCatalog(datasets={'cars': cars}) """ self._config_resolver = config_resolver or DataCatalogConfigResolver() - self._datasets_config = self._config_resolver.config self._datasets = {} self.datasets = {} - self.add_all(dict(datasets) or {}, datasets_configs=self._datasets_config) + self.add_all(dict(datasets) or {}) self._load_versions = load_versions or {} self._save_version = save_version @@ -180,17 +179,13 @@ def __contains__(self, dataset_name: str) -> bool: def __eq__(self, other) -> bool: # type: ignore[no-untyped-def] return (self._datasets, self._config_resolver.dataset_patterns) == ( other._datasets, - other._config_resolver._dataset_patterns, + other._config_resolver.dataset_patterns, ) @property def config_resolver(self) -> DataCatalogConfigResolver: return self._config_resolver - @property - def datasets_config(self) -> dict[str, dict[str, Any]]: - return self._datasets_config - @property def _logger(self) -> logging.Logger: return logging.getLogger(__name__) @@ -324,7 +319,7 @@ def _get_dataset( self._load_versions.get(dataset_name), self._save_version, ) - self.add(dataset_name, ds, dataset_config=ds_config) + self.add(dataset_name, ds) if dataset_name not in self._datasets: error_msg = f"Dataset '{dataset_name}' not found in the catalog" @@ -464,7 +459,6 @@ def add( dataset_name: str, dataset: AbstractDataset, replace: bool = False, - dataset_config: dict[str, Any] | None = None, ) -> None: """Adds a new ``AbstractDataset`` object to the ``DataCatalog``. @@ -475,8 +469,6 @@ def add( set name. replace: Specifies whether to replace an existing dataset with the same name is allowed. - dataset_config: A dictionary with dataset configuration. - Raises: DatasetAlreadyExistsError: When a data set with the same name @@ -501,16 +493,12 @@ def add( f"Dataset '{dataset_name}' has already been registered" ) self._datasets[dataset_name] = dataset - self._datasets_config[dataset_name] = ( - dataset_config if dataset_config is not None else {} - ) self.datasets = _FrozenDatasets(self.datasets, {dataset_name: dataset}) def add_all( self, datasets: dict[str, AbstractDataset], replace: bool = False, - datasets_configs: dict[str, dict[str, Any]] | None = None, ) -> None: """Adds a group of new data sets to the ``DataCatalog``. @@ -519,7 +507,6 @@ def add_all( instances. replace: Specifies whether to replace an existing dataset with the same name is allowed. - datasets_configs: A dictionary of dataset configurations. Raises: DatasetAlreadyExistsError: When a data set with the same name @@ -543,7 +530,7 @@ def add_all( >>> assert catalog.list() == ["cars", "planes", "boats"] """ for ds_name, ds in datasets.items(): - self.add(ds_name, ds, replace, datasets_configs.get(ds_name, {})) + self.add(ds_name, ds, replace) def add_feed_dict(self, feed_dict: dict[str, Any], replace: bool = False) -> None: """Add datasets to the ``DataCatalog`` using the data provided through the `feed_dict`. From 658a759628d2e55e958790229a5f7fb7c637bbfd Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 6 Sep 2024 12:41:49 +0100 Subject: [PATCH 049/173] Fixed lint Signed-off-by: Elena Khaustova --- kedro/framework/cli/catalog.py | 2 +- kedro/io/catalog_config_resolver.py | 19 +++++++++++-------- kedro/io/data_catalog.py | 24 ++++++++++++++---------- kedro/runner/parallel_runner.py | 6 ++++-- kedro/runner/runner.py | 3 ++- kedro/runner/sequential_runner.py | 6 ++++-- kedro/runner/thread_runner.py | 6 ++++-- 7 files changed, 40 insertions(+), 26 deletions(-) diff --git a/kedro/framework/cli/catalog.py b/kedro/framework/cli/catalog.py index 900d34d7e2..37ffebd13c 100644 --- a/kedro/framework/cli/catalog.py +++ b/kedro/framework/cli/catalog.py @@ -98,7 +98,7 @@ def list_datasets(metadata: ProjectMetadata, pipeline: str, env: str) -> None: ) for ds_name, ds_config in zip(default_ds, resolved_configs): if data_catalog.config_resolver.match_pattern(ds_name): - factory_ds_by_type[ds_config.get("type", "DefaultDataset")].append( + factory_ds_by_type[ds_config.get("type", "DefaultDataset")].append( # type: ignore[attr-defined] ds_name ) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index 1236fd397b..0aaae32af8 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -7,12 +7,14 @@ from parse import parse -Patterns = dict[str, dict[str, Any]] +Patterns = dict[str, dict[str, Any] | None] CREDENTIALS_KEY = "credentials" -def _fetch_credentials(credentials_name: str, credentials: dict[str, Any]) -> Any: +def _fetch_credentials( + credentials_name: str, credentials: dict[str, Any] | None +) -> Any: """Fetch the specified credentials from the provided credentials dictionary. Args: @@ -27,6 +29,8 @@ def _fetch_credentials(credentials_name: str, credentials: dict[str, Any]) -> An registered. """ + if credentials is None: + return None try: return credentials[credentials_name] except KeyError as exc: @@ -40,7 +44,7 @@ def _fetch_credentials(credentials_name: str, credentials: dict[str, Any]) -> An def _resolve_credentials( config: dict[str, Any], credentials: dict[str, Any] | None -) -> dict[str, Any]: +) -> dict[str, Any] | None: """Return the dataset configuration where credentials are resolved using credentials dictionary provided. @@ -67,7 +71,7 @@ def _resolve_dataset_config( ds_name: str, pattern: str, config: Any, -) -> dict[str, Any]: +) -> Any: """Resolve dataset configuration based on the provided pattern.""" resolved_vars = parse(pattern, ds_name) # Resolve the factory config for the dataset @@ -102,7 +106,7 @@ def __init__( self._resolved_configs = self._init_configs(config, credentials) @property - def config(self) -> dict[str, dict[str, Any]]: + def config(self) -> dict[str, dict[str, Any] | None]: return self._resolved_configs @property @@ -183,8 +187,7 @@ def _extract_patterns( for ds_name, ds_config in config.items(): if cls.is_pattern(ds_name): - resolved_config = _resolve_credentials(ds_config, credentials) - dataset_patterns[ds_name] = resolved_config + dataset_patterns[ds_name] = _resolve_credentials(ds_config, credentials) sorted_patterns = cls._sort_patterns(dataset_patterns) if sorted_patterns: @@ -199,7 +202,7 @@ def _init_configs( self, config: dict[str, dict[str, Any]] | None, credentials: dict[str, dict[str, Any]] | None, - ) -> dict[str, dict[str, Any]]: + ) -> dict[str, dict[str, Any] | None]: """Initialize the dataset configuration with resolved credentials.""" # TODO: check if deep copies are required config = copy.deepcopy(config) or {} diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index e56610b5c2..be488a71c4 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -47,13 +47,15 @@ class _FrozenDatasets: def __init__( self, - *datasets_collections: _FrozenDatasets | dict[str, AbstractDataset], + *datasets_collections: _FrozenDatasets | dict[str, AbstractDataset] | None, ): """Return a _FrozenDatasets instance from some datasets collections. Each collection could either be another _FrozenDatasets or a dictionary. """ self._original_names: dict[str, str] = {} for collection in datasets_collections: + if collection is None: + continue if isinstance(collection, _FrozenDatasets): self.__dict__.update(collection.__dict__) self._original_names.update(collection._original_names) @@ -109,7 +111,7 @@ def __init__( # noqa: PLR0913 default_pattern: Patterns | None = None, # Kept for interface compatibility load_versions: dict[str, str] | None = None, save_version: str | None = None, - config_resolver: DataCatalogConfigResolver = None, + config_resolver: DataCatalogConfigResolver | None = None, ) -> None: """``DataCatalog`` stores instances of ``AbstractDataset`` implementations to provide ``load`` and ``save`` capabilities from @@ -154,10 +156,10 @@ def __init__( # noqa: PLR0913 >>> catalog = DataCatalog(datasets={'cars': cars}) """ self._config_resolver = config_resolver or DataCatalogConfigResolver() - self._datasets = {} - self.datasets = {} + self._datasets: dict[str, AbstractDataset] = {} + self.datasets: _FrozenDatasets | None = None - self.add_all(dict(datasets) or {}) + self.add_all(datasets or {}) self._load_versions = load_versions or {} self._save_version = save_version @@ -172,8 +174,9 @@ def __repr__(self) -> str: def __contains__(self, dataset_name: str) -> bool: """Check if an item is in the catalog as a materialised dataset or pattern""" - return dataset_name in self._datasets or self._config_resolver.match_pattern( - dataset_name + return ( + dataset_name in self._datasets + or self._config_resolver.match_pattern(dataset_name) is not None ) def __eq__(self, other) -> bool: # type: ignore[no-untyped-def] @@ -267,6 +270,7 @@ class to be loaded is specified with the key ``type`` and their >>> df = catalog.load("cars") >>> catalog.save("boats", df) """ + catalog = catalog or {} datasets = {} config_resolver = DataCatalogConfigResolver(catalog, credentials) save_version = save_version or generate_timestamp() @@ -276,7 +280,7 @@ class to be loaded is specified with the key ``type`` and their if not config_resolver.is_pattern(ds_name): datasets[ds_name] = AbstractDataset.from_config( ds_name, - config_resolver.config[ds_name], + config_resolver.config[ds_name] or {}, load_versions.get(ds_name), save_version, ) @@ -315,7 +319,7 @@ def _get_dataset( if dataset_name not in self._datasets and ds_config is not None: ds = AbstractDataset.from_config( dataset_name, - ds_config, + ds_config, # type: ignore[arg-type] self._load_versions.get(dataset_name), self._save_version, ) @@ -571,7 +575,7 @@ def add_feed_dict(self, feed_dict: dict[str, Any], replace: bool = False) -> Non dataset = ( ds_data if isinstance(ds_data, AbstractDataset) - else MemoryDataset(data=ds_data) + else MemoryDataset(data=ds_data) # type: ignore[abstract] ) # type: ignore[abstract] self.add(ds_name, dataset, replace) diff --git a/kedro/runner/parallel_runner.py b/kedro/runner/parallel_runner.py index 62d7e1216b..e88d197b5b 100644 --- a/kedro/runner/parallel_runner.py +++ b/kedro/runner/parallel_runner.py @@ -106,7 +106,7 @@ def __init__( self, max_workers: int | None = None, is_async: bool = False, - extra_dataset_patterns: dict[str, dict[str, Any]] | None = None, + extra_dataset_patterns: dict[str, dict[str, Any] | None] | None = None, ): """ Instantiates the runner by creating a Manager. @@ -125,7 +125,9 @@ def __init__( Raises: ValueError: bad parameters passed """ - default_dataset_pattern = {"{default}": {"type": "SharedMemoryDataset"}} + default_dataset_pattern: dict[str, dict[str, Any] | None] | None = { + "{default}": {"type": "SharedMemoryDataset"} + } self._extra_dataset_patterns = extra_dataset_patterns or default_dataset_pattern super().__init__( is_async=is_async, extra_dataset_patterns=self._extra_dataset_patterns diff --git a/kedro/runner/runner.py b/kedro/runner/runner.py index 6f165e87c0..81436f7028 100644 --- a/kedro/runner/runner.py +++ b/kedro/runner/runner.py @@ -27,6 +27,7 @@ if TYPE_CHECKING: from pluggy import PluginManager + from kedro.io.catalog_config_resolver import Patterns from kedro.pipeline.node import Node @@ -38,7 +39,7 @@ class AbstractRunner(ABC): def __init__( self, is_async: bool = False, - extra_dataset_patterns: dict[str, dict[str, Any]] | None = None, + extra_dataset_patterns: Patterns | None = None, ): """Instantiates the runner class. diff --git a/kedro/runner/sequential_runner.py b/kedro/runner/sequential_runner.py index 48dac3cd54..8b22ba89f5 100644 --- a/kedro/runner/sequential_runner.py +++ b/kedro/runner/sequential_runner.py @@ -27,7 +27,7 @@ class SequentialRunner(AbstractRunner): def __init__( self, is_async: bool = False, - extra_dataset_patterns: dict[str, dict[str, Any]] | None = None, + extra_dataset_patterns: dict[str, dict[str, Any] | None] | None = None, ): """Instantiates the runner class. @@ -39,7 +39,9 @@ def __init__( for `SequentialRunner`. """ - default_dataset_pattern = {"{default}": {"type": "MemoryDataset"}} + default_dataset_pattern: dict[str, dict[str, Any] | None] | None = { + "{default}": {"type": "SharedMemoryDataset"} + } self._extra_dataset_patterns = extra_dataset_patterns or default_dataset_pattern super().__init__( is_async=is_async, extra_dataset_patterns=self._extra_dataset_patterns diff --git a/kedro/runner/thread_runner.py b/kedro/runner/thread_runner.py index b4751a602a..802f7f7de0 100644 --- a/kedro/runner/thread_runner.py +++ b/kedro/runner/thread_runner.py @@ -31,7 +31,7 @@ def __init__( self, max_workers: int | None = None, is_async: bool = False, - extra_dataset_patterns: dict[str, dict[str, Any]] | None = None, + extra_dataset_patterns: dict[str, dict[str, Any] | None] | None = None, ): """ Instantiates the runner. @@ -56,7 +56,9 @@ def __init__( "node inputs and outputs asynchronously with threads. " "Setting 'is_async' to False." ) - default_dataset_pattern = {"{default}": {"type": "MemoryDataset"}} + default_dataset_pattern: dict[str, dict[str, Any] | None] | None = { + "{default}": {"type": "MemoryDataset"} + } self._extra_dataset_patterns = extra_dataset_patterns or default_dataset_pattern super().__init__( is_async=False, extra_dataset_patterns=self._extra_dataset_patterns From 7be2a8e0d78016ec761cc7fe2406b0a49caa0cb4 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 6 Sep 2024 12:45:13 +0100 Subject: [PATCH 050/173] Fixed typo Signed-off-by: Elena Khaustova --- kedro/runner/sequential_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro/runner/sequential_runner.py b/kedro/runner/sequential_runner.py index 8b22ba89f5..d4dab2628a 100644 --- a/kedro/runner/sequential_runner.py +++ b/kedro/runner/sequential_runner.py @@ -40,7 +40,7 @@ def __init__( """ default_dataset_pattern: dict[str, dict[str, Any] | None] | None = { - "{default}": {"type": "SharedMemoryDataset"} + "{default}": {"type": "MemoryDataset"} } self._extra_dataset_patterns = extra_dataset_patterns or default_dataset_pattern super().__init__( From 9e43a9a7cf00baa81ecd8622d72c9ee4a97d539e Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 6 Sep 2024 13:56:17 +0100 Subject: [PATCH 051/173] Added module docstring Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index 0aaae32af8..b9651157e2 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -1,3 +1,7 @@ +"""``DataCatalogConfigResolver`` resolves dataset configurations and datasets' +patterns based on catalog configuration and credentials provided. +""" + from __future__ import annotations import copy From 49a3b275c32fc2b7a18df0d84b12351fa173b1b1 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 6 Sep 2024 14:40:11 +0100 Subject: [PATCH 052/173] Renaming methods Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index 590a187946..a461416829 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -6,6 +6,7 @@ import re from typing import Any +from kedro.io.catalog_config_resolver import DataCatalogConfigResolver from kedro.io.core import ( AbstractDataset, AbstractVersionedDataset, @@ -37,6 +38,7 @@ def __init__( load_versions: dict[str, str] | None = None, save_version: str | None = None, ) -> None: + self._config_resolver = DataCatalogConfigResolver() self._config = config or {} self._datasets = datasets or {} self._load_versions = load_versions or {} @@ -227,7 +229,10 @@ def load(self, name: str, version: str | None = None) -> Any: return dataset.load() - def add_from_dict(self, datasets: dict[str, Any], replace: bool = False) -> None: + def add_feed_dict(self, datasets: dict[str, Any], replace: bool = False) -> None: + # TODO: rename to add_from_dict after removing old catalog + # Consider changing to add memory datasets only, to simplify the method, + # adding AbstractDataset can be done via add() method for ds_name, ds_data in datasets.items(): dataset = ( ds_data @@ -235,3 +240,22 @@ def add_from_dict(self, datasets: dict[str, Any], replace: bool = False) -> None else MemoryDataset(data=ds_data) ) # type: ignore[abstract] self.add(ds_name, dataset, replace) + + # def shallow_copy( + # self, extra_dataset_patterns: Patterns | None = None + # ) -> KedroDataCatalog: + # """Returns a shallow copy of the current object. + # + # Returns: + # Copy of the current object. + # """ + # if extra_dataset_patterns: + # self._config_resolver.add_runtime_patterns(extra_dataset_patterns) + # return self.__class__( + # datasets=self._datasets, + # dataset_patterns=self._config_resolver.dataset_patterns, + # default_pattern=self._config_resolver.default_pattern, + # load_versions=self._load_versions, + # save_version=self._save_version, + # config_resolver=self._config_resolver, + # ) From 25b65019eee17db68c492decc6750596e5837f6c Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 6 Sep 2024 14:49:41 +0100 Subject: [PATCH 053/173] Removed None from Pattern type Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 10 +++++----- kedro/io/data_catalog.py | 2 +- kedro/runner/parallel_runner.py | 6 ++---- kedro/runner/runner.py | 3 +-- kedro/runner/sequential_runner.py | 6 ++---- kedro/runner/thread_runner.py | 6 ++---- 6 files changed, 13 insertions(+), 20 deletions(-) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index b9651157e2..08cb26adcf 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -11,7 +11,7 @@ from parse import parse -Patterns = dict[str, dict[str, Any] | None] +Patterns = dict[str, dict[str, Any]] CREDENTIALS_KEY = "credentials" @@ -47,8 +47,8 @@ def _fetch_credentials( def _resolve_credentials( - config: dict[str, Any], credentials: dict[str, Any] | None -) -> dict[str, Any] | None: + config: dict[str, Any], credentials: dict[str, Any] +) -> dict[str, Any]: """Return the dataset configuration where credentials are resolved using credentials dictionary provided. @@ -110,7 +110,7 @@ def __init__( self._resolved_configs = self._init_configs(config, credentials) @property - def config(self) -> dict[str, dict[str, Any] | None]: + def config(self) -> dict[str, dict[str, Any]]: return self._resolved_configs @property @@ -206,7 +206,7 @@ def _init_configs( self, config: dict[str, dict[str, Any]] | None, credentials: dict[str, dict[str, Any]] | None, - ) -> dict[str, dict[str, Any] | None]: + ) -> dict[str, dict[str, Any]]: """Initialize the dataset configuration with resolved credentials.""" # TODO: check if deep copies are required config = copy.deepcopy(config) or {} diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index be488a71c4..7a54765740 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -576,7 +576,7 @@ def add_feed_dict(self, feed_dict: dict[str, Any], replace: bool = False) -> Non ds_data if isinstance(ds_data, AbstractDataset) else MemoryDataset(data=ds_data) # type: ignore[abstract] - ) # type: ignore[abstract] + ) self.add(ds_name, dataset, replace) def list(self, regex_search: str | None = None) -> list[str]: diff --git a/kedro/runner/parallel_runner.py b/kedro/runner/parallel_runner.py index e88d197b5b..62d7e1216b 100644 --- a/kedro/runner/parallel_runner.py +++ b/kedro/runner/parallel_runner.py @@ -106,7 +106,7 @@ def __init__( self, max_workers: int | None = None, is_async: bool = False, - extra_dataset_patterns: dict[str, dict[str, Any] | None] | None = None, + extra_dataset_patterns: dict[str, dict[str, Any]] | None = None, ): """ Instantiates the runner by creating a Manager. @@ -125,9 +125,7 @@ def __init__( Raises: ValueError: bad parameters passed """ - default_dataset_pattern: dict[str, dict[str, Any] | None] | None = { - "{default}": {"type": "SharedMemoryDataset"} - } + default_dataset_pattern = {"{default}": {"type": "SharedMemoryDataset"}} self._extra_dataset_patterns = extra_dataset_patterns or default_dataset_pattern super().__init__( is_async=is_async, extra_dataset_patterns=self._extra_dataset_patterns diff --git a/kedro/runner/runner.py b/kedro/runner/runner.py index 81436f7028..6f165e87c0 100644 --- a/kedro/runner/runner.py +++ b/kedro/runner/runner.py @@ -27,7 +27,6 @@ if TYPE_CHECKING: from pluggy import PluginManager - from kedro.io.catalog_config_resolver import Patterns from kedro.pipeline.node import Node @@ -39,7 +38,7 @@ class AbstractRunner(ABC): def __init__( self, is_async: bool = False, - extra_dataset_patterns: Patterns | None = None, + extra_dataset_patterns: dict[str, dict[str, Any]] | None = None, ): """Instantiates the runner class. diff --git a/kedro/runner/sequential_runner.py b/kedro/runner/sequential_runner.py index d4dab2628a..48dac3cd54 100644 --- a/kedro/runner/sequential_runner.py +++ b/kedro/runner/sequential_runner.py @@ -27,7 +27,7 @@ class SequentialRunner(AbstractRunner): def __init__( self, is_async: bool = False, - extra_dataset_patterns: dict[str, dict[str, Any] | None] | None = None, + extra_dataset_patterns: dict[str, dict[str, Any]] | None = None, ): """Instantiates the runner class. @@ -39,9 +39,7 @@ def __init__( for `SequentialRunner`. """ - default_dataset_pattern: dict[str, dict[str, Any] | None] | None = { - "{default}": {"type": "MemoryDataset"} - } + default_dataset_pattern = {"{default}": {"type": "MemoryDataset"}} self._extra_dataset_patterns = extra_dataset_patterns or default_dataset_pattern super().__init__( is_async=is_async, extra_dataset_patterns=self._extra_dataset_patterns diff --git a/kedro/runner/thread_runner.py b/kedro/runner/thread_runner.py index 802f7f7de0..b4751a602a 100644 --- a/kedro/runner/thread_runner.py +++ b/kedro/runner/thread_runner.py @@ -31,7 +31,7 @@ def __init__( self, max_workers: int | None = None, is_async: bool = False, - extra_dataset_patterns: dict[str, dict[str, Any] | None] | None = None, + extra_dataset_patterns: dict[str, dict[str, Any]] | None = None, ): """ Instantiates the runner. @@ -56,9 +56,7 @@ def __init__( "node inputs and outputs asynchronously with threads. " "Setting 'is_async' to False." ) - default_dataset_pattern: dict[str, dict[str, Any] | None] | None = { - "{default}": {"type": "MemoryDataset"} - } + default_dataset_pattern = {"{default}": {"type": "MemoryDataset"}} self._extra_dataset_patterns = extra_dataset_patterns or default_dataset_pattern super().__init__( is_async=False, extra_dataset_patterns=self._extra_dataset_patterns From 3a646de40892fddcf04776a4421a75e078e49201 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 6 Sep 2024 14:54:45 +0100 Subject: [PATCH 054/173] Fixed docs failing to find class reference Signed-off-by: Elena Khaustova --- docs/source/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/conf.py b/docs/source/conf.py index 562f5a4b0e..4ac54eefbc 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -127,6 +127,7 @@ "typing.Type", "typing.Set", "kedro.config.config.ConfigLoader", + "kedro.io.catalog_config_resolver.DataCatalogConfigResolver", "kedro.io.core.AbstractDataset", "kedro.io.core.AbstractVersionedDataset", "kedro.io.core.DatasetError", From 5e5df4ab7a044c95094dd62b3d998b7b0b87b970 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 6 Sep 2024 14:59:16 +0100 Subject: [PATCH 055/173] Fixed docs failing to find class reference Signed-off-by: Elena Khaustova --- docs/source/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/conf.py b/docs/source/conf.py index 4ac54eefbc..635a5220a0 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -169,6 +169,7 @@ "D[k] if k in D, else d. d defaults to None.", "None. Update D from mapping/iterable E and F.", "Patterns", + "DataCatalogConfigResolver", ), "py:data": ( "typing.Any", From aa59a35b16520e99d069067672d909c9df4eccca Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 6 Sep 2024 15:17:17 +0100 Subject: [PATCH 056/173] Updated Patterns type Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index 08cb26adcf..f9d20294fd 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -7,11 +7,11 @@ import copy import logging import re -from typing import Any +from typing import Any, Dict from parse import parse -Patterns = dict[str, dict[str, Any]] +Patterns = Dict[str, Dict[str, Any]] CREDENTIALS_KEY = "credentials" From c7efa3e5954fbe6d518426cd9392a47b60b78ac9 Mon Sep 17 00:00:00 2001 From: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com> Date: Fri, 6 Sep 2024 16:35:09 +0100 Subject: [PATCH 057/173] Fix tests (#4149) * Fix most tests Signed-off-by: Ankita Katiyar * Fix most tests Signed-off-by: Ankita Katiyar --------- Signed-off-by: Ankita Katiyar --- kedro/io/catalog_config_resolver.py | 8 ++++++++ kedro/io/data_catalog.py | 6 +++++- tests/framework/cli/test_catalog.py | 3 +-- tests/framework/session/test_session.py | 2 +- tests/io/test_data_catalog.py | 26 ++++++++++++------------- 5 files changed, 27 insertions(+), 18 deletions(-) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index f9d20294fd..d771363c90 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -11,6 +11,8 @@ from parse import parse +from kedro.io.core import DatasetError + Patterns = Dict[str, Dict[str, Any]] CREDENTIALS_KEY = "credentials" @@ -214,6 +216,12 @@ def _init_configs( resolved_configs = {} for ds_name, ds_config in config.items(): + if not isinstance(ds_config, dict): + raise DatasetError( + f"Catalog entry '{ds_name}' is not a valid dataset configuration. " + "\nHint: If this catalog entry is intended for variable interpolation, " + "make sure that the key is preceded by an underscore." + ) if not self.is_pattern(ds_name): resolved_configs[ds_name] = _resolve_credentials(ds_config, credentials) diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index 7a54765740..9c031c56ac 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -14,7 +14,11 @@ import re from typing import Any -from kedro.io.catalog_config_resolver import DataCatalogConfigResolver, Patterns +from kedro.io.catalog_config_resolver import ( + CREDENTIALS_KEY, # noqa: F401 + DataCatalogConfigResolver, + Patterns, +) from kedro.io.core import ( AbstractDataset, AbstractVersionedDataset, diff --git a/tests/framework/cli/test_catalog.py b/tests/framework/cli/test_catalog.py index f34034296e..b898b270f9 100644 --- a/tests/framework/cli/test_catalog.py +++ b/tests/framework/cli/test_catalog.py @@ -490,7 +490,6 @@ def test_rank_catalog_factories( mocked_context.catalog = DataCatalog.from_config( fake_catalog_with_overlapping_factories ) - print("!!!!", mocked_context.catalog._dataset_patterns) result = CliRunner().invoke( fake_project_cli, ["catalog", "rank"], obj=fake_metadata ) @@ -547,7 +546,7 @@ def test_catalog_resolve( mocked_context.catalog = DataCatalog.from_config( catalog=fake_catalog_config, credentials=fake_credentials_config ) - placeholder_ds = mocked_context.catalog._dataset_patterns.keys() + placeholder_ds = mocked_context.catalog.config_resolver.dataset_patterns.keys() pipeline_datasets = {"csv_example", "parquet_example", "explicit_dataset"} mocker.patch.object( diff --git a/tests/framework/session/test_session.py b/tests/framework/session/test_session.py index bc25db37c7..71c2fbb2f5 100644 --- a/tests/framework/session/test_session.py +++ b/tests/framework/session/test_session.py @@ -730,7 +730,7 @@ def test_run_thread_runner( } mocker.patch("kedro.framework.session.session.pipelines", pipelines_ret) mocker.patch( - "kedro.io.data_catalog.DataCatalog._match_pattern", + "kedro.io.data_catalog.DataCatalogConfigResolver.match_pattern", return_value=match_pattern, ) diff --git a/tests/io/test_data_catalog.py b/tests/io/test_data_catalog.py index dbec57e64d..0e3e44a6b5 100644 --- a/tests/io/test_data_catalog.py +++ b/tests/io/test_data_catalog.py @@ -846,7 +846,7 @@ def test_match_added_to_datasets_on_get(self, config_with_dataset_factories): catalog = DataCatalog.from_config(**config_with_dataset_factories) assert "{brand}_cars" not in catalog._datasets assert "tesla_cars" not in catalog._datasets - assert "{brand}_cars" in catalog._dataset_patterns + assert "{brand}_cars" in catalog.config_resolver._dataset_patterns tesla_cars = catalog._get_dataset("tesla_cars") assert isinstance(tesla_cars, CSVDataset) @@ -875,8 +875,8 @@ def test_patterns_not_in_catalog_datasets(self, config_with_dataset_factories): catalog = DataCatalog.from_config(**config_with_dataset_factories) assert "audi_cars" in catalog._datasets assert "{brand}_cars" not in catalog._datasets - assert "audi_cars" not in catalog._dataset_patterns - assert "{brand}_cars" in catalog._dataset_patterns + assert "audi_cars" not in catalog.config_resolver._dataset_patterns + assert "{brand}_cars" in catalog.config_resolver._dataset_patterns def test_explicit_entry_not_overwritten(self, config_with_dataset_factories): """Check that the existing catalog entry is not overwritten by config in pattern""" @@ -909,11 +909,7 @@ def test_sorting_order_patterns(self, config_with_dataset_factories_only_pattern "{dataset}s", "{user_default}", ] - assert ( - list(catalog._dataset_patterns.keys()) - + list(catalog._default_pattern.keys()) - == sorted_keys_expected - ) + assert catalog.config_resolver.list_patterns() == sorted_keys_expected def test_multiple_catch_all_patterns_not_allowed( self, config_with_dataset_factories @@ -929,7 +925,7 @@ def test_multiple_catch_all_patterns_not_allowed( } with pytest.raises( - DatasetError, match="Multiple catch-all patterns found in the catalog" + ValueError, match="Multiple catch-all patterns found in the catalog" ): DataCatalog.from_config(**config_with_dataset_factories) @@ -959,7 +955,7 @@ def test_sorting_order_with_other_dataset_through_extra_pattern( "{default}", ] assert ( - list(catalog_with_default._dataset_patterns.keys()) == sorted_keys_expected + catalog_with_default.config_resolver.list_patterns() == sorted_keys_expected ) def test_user_default_overwrites_runner_default(self): @@ -989,10 +985,12 @@ def test_user_default_overwrites_runner_default(self): "{dataset}s", "{a_default}", ] - assert "{a_default}" in catalog_with_runner_default._default_pattern assert ( - list(catalog_with_runner_default._dataset_patterns.keys()) - + list(catalog_with_runner_default._default_pattern.keys()) + "{a_default}" in catalog_with_runner_default.config_resolver.default_pattern + ) + assert ( + list(catalog_with_runner_default.config_resolver.dataset_patterns.keys()) + + list(catalog_with_runner_default.config_resolver.default_pattern.keys()) == sorted_keys_expected ) @@ -1019,7 +1017,7 @@ def test_unmatched_key_error_when_parsing_config( "Unable to resolve 'data/01_raw/{brand}_plane.pq' from the pattern '{type}@planes'. " "Keys used in the configuration should be present in the dataset factory pattern." ) - with pytest.raises(DatasetError, match=re.escape(pattern)): + with pytest.raises(KeyError, match=re.escape(pattern)): catalog._get_dataset("jet@planes") def test_factory_config_versioned( From 023ffc629924c7b318a020373c04a8f933a9ad23 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 6 Sep 2024 16:39:57 +0100 Subject: [PATCH 058/173] Returned constants to avoid breaking changes Signed-off-by: Elena Khaustova --- kedro/io/data_catalog.py | 1 + 1 file changed, 1 insertion(+) diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index 9c031c56ac..f5597c6cc0 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -31,6 +31,7 @@ from kedro.io.memory_dataset import MemoryDataset from kedro.utils import _format_rich, _has_rich_handler +CATALOG_KEY = "catalog" # Kept to avoid the breaking change WORDS_REGEX_PATTERN = re.compile(r"\W+") From d57a567167ee44669a52cf747bcf765f390d1f31 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 6 Sep 2024 18:21:08 +0100 Subject: [PATCH 059/173] Udapted KedroDataCatalog for recent changes Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 148 ++++++++++++++++++------------ 1 file changed, 90 insertions(+), 58 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index a461416829..a0451fd4ad 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -1,3 +1,10 @@ +"""``KedroDataCatalog`` stores instances of ``AbstractDataset`` implementations to +provide ``load`` and ``save`` capabilities from anywhere in the program. To +use a ``KedroDataCatalog``, you need to instantiate it with a dictionary of data +sets. Then it will act as a single point of reference for your calls, +relaying load and save functions to the underlying data sets. +""" + from __future__ import annotations import copy @@ -6,7 +13,7 @@ import re from typing import Any -from kedro.io.catalog_config_resolver import DataCatalogConfigResolver +from kedro.io.catalog_config_resolver import DataCatalogConfigResolver, Patterns from kedro.io.core import ( AbstractDataset, AbstractVersionedDataset, @@ -14,6 +21,7 @@ DatasetError, DatasetNotFoundError, Version, + generate_timestamp, ) from kedro.io.memory_dataset import MemoryDataset from kedro.utils import _format_rich, _has_rich_handler @@ -33,26 +41,24 @@ def validate_dataset_config(ds_name: str, ds_config: Any) -> None: class KedroDataCatalog: def __init__( self, - datasets: dict[str, Any] | None = None, - config: dict[str, dict[str, Any]] | None = None, + datasets: dict[str, AbstractDataset] | None = None, + feed_dict: dict[str, Any] | None = None, load_versions: dict[str, str] | None = None, save_version: str | None = None, + config_resolver: DataCatalogConfigResolver | None = None, ) -> None: - self._config_resolver = DataCatalogConfigResolver() - self._config = config or {} + self._config_resolver = config_resolver or DataCatalogConfigResolver() self._datasets = datasets or {} self._load_versions = load_versions or {} self._save_version = save_version - self._use_rich_markup = _has_rich_handler() - for ds_name in self._datasets: - # TODO: API to get configuration from dataset - self._config[ds_name] = {} + self._use_rich_markup = _has_rich_handler() - for ds_name, ds_config in self._config.items(): - self.init_dataset(ds_name, ds_config) + for ds_name, ds_config in self._config_resolver.config.items(): + self._init_dataset(ds_name, ds_config) - self._validate_missing_keys() + if feed_dict: + self.add_feed_dict(feed_dict) @property def datasets(self) -> dict[str, Any]: @@ -65,14 +71,8 @@ def datasets(self, value: Any): ) @property - def config(self): - return copy.deepcopy(self._config) - - @config.setter - def config(self, value: Any) -> dict[str, dict[str, Any]]: - raise AttributeError( - "Operation not allowed! Please change datasets through configuration." - ) + def config_resolver(self) -> DataCatalogConfigResolver: + return self._config_resolver def __iter__(self): yield from self._datasets.values() @@ -87,26 +87,65 @@ def __contains__(self, ds_name: str) -> bool: def _ipython_key_completions_(self) -> list[str]: return list(self._datasets.keys()) - def init_dataset(self, ds_name: str, ds_config: dict[str, Any]) -> None: + @classmethod + def from_config( + cls, + catalog: dict[str, dict[str, Any]] | None, + credentials: dict[str, dict[str, Any]] | None = None, + load_versions: dict[str, str] | None = None, + save_version: str | None = None, + ) -> KedroDataCatalog: + """Create a ``DataCatalog`` instance from configuration. This is a + factory method used to provide developers with a way to instantiate + ``DataCatalog`` with configuration parsed from configuration files. + """ + catalog = catalog or {} + config_resolver = DataCatalogConfigResolver(catalog, credentials) + save_version = save_version or generate_timestamp() + load_versions = load_versions or {} + + missing_keys = [ + ds_name + for ds_name in load_versions + if not ( + ds_name in config_resolver.config + or config_resolver.match_pattern(ds_name) + ) + ] + if missing_keys: + raise DatasetNotFoundError( + f"'load_versions' keys [{', '.join(sorted(missing_keys))}] " + f"are not found in the catalog." + ) + + return cls( + load_versions=load_versions, + save_version=save_version, + config_resolver=config_resolver, + ) + + def _init_dataset(self, ds_name: str, ds_config: dict[str, Any]) -> None: # Add lazy loading feature to store the configuration but not to init actual dataset # Initialise actual dataset when load or save # Add is_init property validate_dataset_config(ds_name, ds_config) - if ds_name in self._datasets: - raise DatasetAlreadyExistsError( - f"Dataset '{ds_name}' has already been registered" - ) - self._config[ds_name] = ds_config - self._datasets[ds_name] = AbstractDataset.from_config( + ds = AbstractDataset.from_config( ds_name, ds_config, self._load_versions.get(ds_name), self._save_version, ) + self.add(ds_name, ds) + def get_dataset( - self, ds_name: str, suggest: bool = True, version: Version | None = None + self, ds_name: str, version: Version | None = None, suggest: bool = True ) -> AbstractDataset: + ds_config = self._config_resolver.resolve_dataset_pattern(ds_name) + + if ds_name not in self._datasets and ds_config is not None: + self._init_dataset(ds_name, ds_config) + dataset = self._datasets.get(ds_name, None) if dataset is None: @@ -127,6 +166,12 @@ def get_dataset( return dataset + def _get_dataset( + self, dataset_name: str, version: Version | None = None, suggest: bool = True + ) -> AbstractDataset: + # TODO: remove when removing old catalog + return self.get_dataset(dataset_name, version, suggest) + def add( self, ds_name: str, dataset: AbstractDataset, replace: bool = False ) -> None: @@ -139,8 +184,6 @@ def add( f"Dataset '{ds_name}' has already been registered" ) self._datasets[ds_name] = dataset - # TODO: API to get configuration from dataset - self._config[ds_name] = {} @property def _logger(self) -> logging.Logger: @@ -207,14 +250,6 @@ def confirm(self, name: str) -> None: else: raise DatasetError(f"Dataset '{name}' does not have 'confirm' method") - def _validate_missing_keys(self) -> None: - missing_keys = [key for key in self._load_versions if key not in self._config] - if missing_keys: - raise DatasetNotFoundError( - f"'load_versions' keys [{', '.join(sorted(missing_keys))}] " - f"are not found in the catalog." - ) - def load(self, name: str, version: str | None = None) -> Any: """Loads a registered data set.""" load_version = Version(version, None) if version else None @@ -229,8 +264,7 @@ def load(self, name: str, version: str | None = None) -> Any: return dataset.load() - def add_feed_dict(self, datasets: dict[str, Any], replace: bool = False) -> None: - # TODO: rename to add_from_dict after removing old catalog + def add_from_dict(self, datasets: dict[str, Any], replace: bool = False) -> None: # Consider changing to add memory datasets only, to simplify the method, # adding AbstractDataset can be done via add() method for ds_name, ds_data in datasets.items(): @@ -241,21 +275,19 @@ def add_feed_dict(self, datasets: dict[str, Any], replace: bool = False) -> None ) # type: ignore[abstract] self.add(ds_name, dataset, replace) - # def shallow_copy( - # self, extra_dataset_patterns: Patterns | None = None - # ) -> KedroDataCatalog: - # """Returns a shallow copy of the current object. - # - # Returns: - # Copy of the current object. - # """ - # if extra_dataset_patterns: - # self._config_resolver.add_runtime_patterns(extra_dataset_patterns) - # return self.__class__( - # datasets=self._datasets, - # dataset_patterns=self._config_resolver.dataset_patterns, - # default_pattern=self._config_resolver.default_pattern, - # load_versions=self._load_versions, - # save_version=self._save_version, - # config_resolver=self._config_resolver, - # ) + def add_feed_dict(self, feed_dict: dict[str, Any], replace: bool = False) -> None: + # TODO: remove when removing old catalog + return self.add_from_dict(feed_dict, replace) + + def shallow_copy( + self, extra_dataset_patterns: Patterns | None = None + ) -> KedroDataCatalog: + # TODO: remove when old catalog + """Returns a shallow copy of the current object. + + Returns: + Copy of the current object. + """ + if extra_dataset_patterns: + self._config_resolver.add_runtime_patterns(extra_dataset_patterns) + return self From 585b44f7315854a4dad2c7a271cf872d0ec37231 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 6 Sep 2024 18:26:09 +0100 Subject: [PATCH 060/173] Minor fix Signed-off-by: Elena Khaustova --- kedro/io/data_catalog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index f5597c6cc0..778fbb3bb4 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -285,7 +285,7 @@ class to be loaded is specified with the key ``type`` and their if not config_resolver.is_pattern(ds_name): datasets[ds_name] = AbstractDataset.from_config( ds_name, - config_resolver.config[ds_name] or {}, + config_resolver.config[ds_name], load_versions.get(ds_name), save_version, ) From e447078908690503026a0cc98546dff29d6f9649 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 9 Sep 2024 11:12:57 +0100 Subject: [PATCH 061/173] Updated test_sorting_order_with_other_dataset_through_extra_pattern Signed-off-by: Elena Khaustova --- tests/io/test_data_catalog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/io/test_data_catalog.py b/tests/io/test_data_catalog.py index 0e3e44a6b5..9a118d8110 100644 --- a/tests/io/test_data_catalog.py +++ b/tests/io/test_data_catalog.py @@ -949,9 +949,9 @@ def test_sorting_order_with_other_dataset_through_extra_pattern( ) sorted_keys_expected = [ "{country}_companies", - "{another}#csv", "{namespace}_{dataset}", "{dataset}s", + "{another}#csv", "{default}", ] assert ( From 975e9685c937755511d2d7ab17b42cf0100e328e Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 9 Sep 2024 11:52:07 +0100 Subject: [PATCH 062/173] Removed odd properties Signed-off-by: Elena Khaustova --- kedro/framework/cli/catalog.py | 7 +------ kedro/io/catalog_config_resolver.py | 8 -------- kedro/io/data_catalog.py | 12 ++++++------ 3 files changed, 7 insertions(+), 20 deletions(-) diff --git a/kedro/framework/cli/catalog.py b/kedro/framework/cli/catalog.py index 37ffebd13c..f6a58664fb 100644 --- a/kedro/framework/cli/catalog.py +++ b/kedro/framework/cli/catalog.py @@ -214,12 +214,7 @@ def rank_catalog_factories(metadata: ProjectMetadata, env: str) -> None: session = _create_session(metadata.package_name, env=env) context = session.load_context() - catalog_factories = list( - { - **context.catalog.config_resolver.dataset_patterns, - **context.catalog.config_resolver.default_pattern, - }.keys() - ) + catalog_factories = context.catalog.config_resolver.list_patterns() if catalog_factories: click.echo(yaml.dump(catalog_factories)) else: diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index d771363c90..8fc0c82aa6 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -115,14 +115,6 @@ def __init__( def config(self) -> dict[str, dict[str, Any]]: return self._resolved_configs - @property - def dataset_patterns(self) -> Patterns: - return self._dataset_patterns - - @property - def default_pattern(self) -> Patterns: - return self._default_pattern - @property def _logger(self) -> logging.Logger: return logging.getLogger(__name__) diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index 778fbb3bb4..7cd8c31690 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -185,9 +185,9 @@ def __contains__(self, dataset_name: str) -> bool: ) def __eq__(self, other) -> bool: # type: ignore[no-untyped-def] - return (self._datasets, self._config_resolver.dataset_patterns) == ( + return (self._datasets, self._config_resolver.list_patterns()) == ( other._datasets, - other._config_resolver.dataset_patterns, + other.config_resolver.list_patterns(), ) @property @@ -306,8 +306,8 @@ class to be loaded is specified with the key ``type`` and their return cls( datasets=datasets, - dataset_patterns=config_resolver.dataset_patterns, - default_pattern=config_resolver.default_pattern, + dataset_patterns=config_resolver._dataset_patterns, + default_pattern=config_resolver._default_pattern, load_versions=load_versions, save_version=save_version, config_resolver=config_resolver, @@ -641,8 +641,8 @@ def shallow_copy( self._config_resolver.add_runtime_patterns(extra_dataset_patterns) return self.__class__( datasets=self._datasets, - dataset_patterns=self._config_resolver.dataset_patterns, - default_pattern=self._config_resolver.default_pattern, + dataset_patterns=self._config_resolver._dataset_patterns, + default_pattern=self._config_resolver._default_pattern, load_versions=self._load_versions, save_version=self._save_version, config_resolver=self._config_resolver, From 11d782cf71f5ddc8bd2689de73e5cc7fd75afbd8 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 9 Sep 2024 11:58:55 +0100 Subject: [PATCH 063/173] Updated tests Signed-off-by: Elena Khaustova --- tests/framework/cli/test_catalog.py | 2 +- tests/io/test_data_catalog.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/framework/cli/test_catalog.py b/tests/framework/cli/test_catalog.py index b898b270f9..7a61c9e7a0 100644 --- a/tests/framework/cli/test_catalog.py +++ b/tests/framework/cli/test_catalog.py @@ -546,7 +546,7 @@ def test_catalog_resolve( mocked_context.catalog = DataCatalog.from_config( catalog=fake_catalog_config, credentials=fake_credentials_config ) - placeholder_ds = mocked_context.catalog.config_resolver.dataset_patterns.keys() + placeholder_ds = mocked_context.catalog.config_resolver.list_patterns() pipeline_datasets = {"csv_example", "parquet_example", "explicit_dataset"} mocker.patch.object( diff --git a/tests/io/test_data_catalog.py b/tests/io/test_data_catalog.py index 9a118d8110..be8ed0831e 100644 --- a/tests/io/test_data_catalog.py +++ b/tests/io/test_data_catalog.py @@ -984,13 +984,15 @@ def test_user_default_overwrites_runner_default(self): sorted_keys_expected = [ "{dataset}s", "{a_default}", + "{another}#csv", + "{default}", ] assert ( - "{a_default}" in catalog_with_runner_default.config_resolver.default_pattern + "{a_default}" + in catalog_with_runner_default.config_resolver._default_pattern ) assert ( - list(catalog_with_runner_default.config_resolver.dataset_patterns.keys()) - + list(catalog_with_runner_default.config_resolver.default_pattern.keys()) + catalog_with_runner_default.config_resolver.list_patterns() == sorted_keys_expected ) From e4abd2311f25534cf936635848f3e5d50883ed5f Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 9 Sep 2024 12:57:53 +0100 Subject: [PATCH 064/173] Removed None from _fetch_credentials input Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index 8fc0c82aa6..73aeb4a830 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -18,9 +18,7 @@ CREDENTIALS_KEY = "credentials" -def _fetch_credentials( - credentials_name: str, credentials: dict[str, Any] | None -) -> Any: +def _fetch_credentials(credentials_name: str, credentials: dict[str, Any]) -> Any: """Fetch the specified credentials from the provided credentials dictionary. Args: @@ -35,8 +33,6 @@ def _fetch_credentials( registered. """ - if credentials is None: - return None try: return credentials[credentials_name] except KeyError as exc: From f9cb9c6804d366bc9df8319a0c9145077f07a009 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 9 Sep 2024 15:29:07 +0100 Subject: [PATCH 065/173] Updated specs and context Signed-off-by: Elena Khaustova --- kedro/framework/context/context.py | 16 +++++++-------- kedro/framework/hooks/specs.py | 31 ++++++++++++++++-------------- kedro/io/__init__.py | 3 ++- kedro/io/data_catalog_redesign.py | 10 ++++++++-- 4 files changed, 35 insertions(+), 25 deletions(-) diff --git a/kedro/framework/context/context.py b/kedro/framework/context/context.py index 3b61b747f6..245c64e0b9 100644 --- a/kedro/framework/context/context.py +++ b/kedro/framework/context/context.py @@ -14,7 +14,7 @@ from kedro.config import AbstractConfigLoader, MissingConfigException from kedro.framework.project import settings -from kedro.io import DataCatalog # noqa: TCH001 +from kedro.io import AbstractDataCatalog, DataCatalog # noqa: TCH001 from kedro.pipeline.transcoding import _transcode_split if TYPE_CHECKING: @@ -123,7 +123,7 @@ def _convert_paths_to_absolute_posix( return conf_dictionary -def _validate_transcoded_datasets(catalog: DataCatalog) -> None: +def _validate_transcoded_datasets(catalog: AbstractDataCatalog) -> None: """Validates transcoded datasets are correctly named Args: @@ -178,13 +178,13 @@ class KedroContext: ) @property - def catalog(self) -> DataCatalog: - """Read-only property referring to Kedro's ``DataCatalog`` for this context. + def catalog(self) -> AbstractDataCatalog: + """Read-only property referring to Kedro's ``AbstractDataCatalog`` for this context. Returns: DataCatalog defined in `catalog.yml`. Raises: - KedroContextError: Incorrect ``DataCatalog`` registered for the project. + KedroContextError: Incorrect ``AbstractDataCatalog`` registered for the project. """ return self._get_catalog() @@ -213,13 +213,13 @@ def _get_catalog( self, save_version: str | None = None, load_versions: dict[str, str] | None = None, - ) -> DataCatalog: - """A hook for changing the creation of a DataCatalog instance. + ) -> AbstractDataCatalog: + """A hook for changing the creation of a AbstractDataCatalog instance. Returns: DataCatalog defined in `catalog.yml`. Raises: - KedroContextError: Incorrect ``DataCatalog`` registered for the project. + KedroContextError: Incorrect ``AbstractDataCatalog`` registered for the project. """ # '**/catalog*' reads modular pipeline configs diff --git a/kedro/framework/hooks/specs.py b/kedro/framework/hooks/specs.py index b0037a0878..0a7f0c9295 100644 --- a/kedro/framework/hooks/specs.py +++ b/kedro/framework/hooks/specs.py @@ -11,7 +11,7 @@ if TYPE_CHECKING: from kedro.framework.context import KedroContext - from kedro.io import DataCatalog + from kedro.io import AbstractDataCatalog from kedro.pipeline import Pipeline from kedro.pipeline.node import Node @@ -22,7 +22,7 @@ class DataCatalogSpecs: @hook_spec def after_catalog_created( # noqa: PLR0913 self, - catalog: DataCatalog, + catalog: AbstractDataCatalog, conf_catalog: dict[str, Any], conf_creds: dict[str, Any], feed_dict: dict[str, Any], @@ -53,7 +53,7 @@ class NodeSpecs: def before_node_run( self, node: Node, - catalog: DataCatalog, + catalog: AbstractDataCatalog, inputs: dict[str, Any], is_async: bool, session_id: str, @@ -63,7 +63,7 @@ def before_node_run( Args: node: The ``Node`` to run. - catalog: A ``DataCatalog`` containing the node's inputs and outputs. + catalog: A ``AbstractDataCatalog`` containing the node's inputs and outputs. inputs: The dictionary of inputs dataset. The keys are dataset names and the values are the actual loaded input data, not the dataset instance. @@ -81,7 +81,7 @@ def before_node_run( def after_node_run( # noqa: PLR0913 self, node: Node, - catalog: DataCatalog, + catalog: AbstractDataCatalog, inputs: dict[str, Any], outputs: dict[str, Any], is_async: bool, @@ -93,7 +93,7 @@ def after_node_run( # noqa: PLR0913 Args: node: The ``Node`` that ran. - catalog: A ``DataCatalog`` containing the node's inputs and outputs. + catalog: A ``AbstractDataCatalog`` containing the node's inputs and outputs. inputs: The dictionary of inputs dataset. The keys are dataset names and the values are the actual loaded input data, not the dataset instance. @@ -110,7 +110,7 @@ def on_node_error( # noqa: PLR0913 self, error: Exception, node: Node, - catalog: DataCatalog, + catalog: AbstractDataCatalog, inputs: dict[str, Any], is_async: bool, session_id: str, @@ -122,7 +122,7 @@ def on_node_error( # noqa: PLR0913 Args: error: The uncaught exception thrown during the node run. node: The ``Node`` to run. - catalog: A ``DataCatalog`` containing the node's inputs and outputs. + catalog: A ``AbstractDataCatalog`` containing the node's inputs and outputs. inputs: The dictionary of inputs dataset. The keys are dataset names and the values are the actual loaded input data, not the dataset instance. @@ -137,7 +137,10 @@ class PipelineSpecs: @hook_spec def before_pipeline_run( - self, run_params: dict[str, Any], pipeline: Pipeline, catalog: DataCatalog + self, + run_params: dict[str, Any], + pipeline: Pipeline, + catalog: AbstractDataCatalog, ) -> None: """Hook to be invoked before a pipeline runs. @@ -164,7 +167,7 @@ def before_pipeline_run( } pipeline: The ``Pipeline`` that will be run. - catalog: The ``DataCatalog`` to be used during the run. + catalog: The ``AbstractDataCatalog`` to be used during the run. """ pass @@ -174,7 +177,7 @@ def after_pipeline_run( run_params: dict[str, Any], run_result: dict[str, Any], pipeline: Pipeline, - catalog: DataCatalog, + catalog: AbstractDataCatalog, ) -> None: """Hook to be invoked after a pipeline runs. @@ -202,7 +205,7 @@ def after_pipeline_run( run_result: The output of ``Pipeline`` run. pipeline: The ``Pipeline`` that was run. - catalog: The ``DataCatalog`` used during the run. + catalog: The ``AbstractDataCatalog`` used during the run. """ pass @@ -212,7 +215,7 @@ def on_pipeline_error( error: Exception, run_params: dict[str, Any], pipeline: Pipeline, - catalog: DataCatalog, + catalog: AbstractDataCatalog, ) -> None: """Hook to be invoked if a pipeline run throws an uncaught Exception. The signature of this error hook should match the signature of ``before_pipeline_run`` @@ -242,7 +245,7 @@ def on_pipeline_error( } pipeline: The ``Pipeline`` that will was run. - catalog: The ``DataCatalog`` used during the run. + catalog: The ``AbstractDataCatalog`` used during the run. """ pass diff --git a/kedro/io/__init__.py b/kedro/io/__init__.py index db3c295449..53b77cceba 100644 --- a/kedro/io/__init__.py +++ b/kedro/io/__init__.py @@ -15,13 +15,14 @@ Version, ) from .data_catalog import DataCatalog -from .data_catalog_redesign import KedroDataCatalog +from .data_catalog_redesign import AbstractDataCatalog, KedroDataCatalog from .lambda_dataset import LambdaDataset from .memory_dataset import MemoryDataset from .shared_memory_dataset import SharedMemoryDataset __all__ = [ "AbstractDataset", + "AbstractDataCatalog", "AbstractVersionedDataset", "CachedDataset", "DataCatalog", diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index a0451fd4ad..cb36649571 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -11,8 +11,9 @@ import difflib import logging import re -from typing import Any +from typing import TYPE_CHECKING, Any +from kedro.io import DataCatalog from kedro.io.catalog_config_resolver import DataCatalogConfigResolver, Patterns from kedro.io.core import ( AbstractDataset, @@ -26,6 +27,9 @@ from kedro.io.memory_dataset import MemoryDataset from kedro.utils import _format_rich, _has_rich_handler +if TYPE_CHECKING: + from types import UnionType + CREDENTIALS_KEY = "credentials" @@ -127,7 +131,6 @@ def from_config( def _init_dataset(self, ds_name: str, ds_config: dict[str, Any]) -> None: # Add lazy loading feature to store the configuration but not to init actual dataset # Initialise actual dataset when load or save - # Add is_init property validate_dataset_config(ds_name, ds_config) ds = AbstractDataset.from_config( ds_name, @@ -291,3 +294,6 @@ def shallow_copy( if extra_dataset_patterns: self._config_resolver.add_runtime_patterns(extra_dataset_patterns) return self + + +AbstractDataCatalog: UnionType = DataCatalog | KedroDataCatalog From 31a9484a4035e88711f7dcc0fc01affebcf6174e Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 9 Sep 2024 15:40:32 +0100 Subject: [PATCH 066/173] Updated runners Signed-off-by: Elena Khaustova --- kedro/runner/parallel_runner.py | 20 ++++++----- kedro/runner/runner.py | 55 +++++++++++++++++-------------- kedro/runner/sequential_runner.py | 8 ++--- kedro/runner/thread_runner.py | 8 ++--- 4 files changed, 50 insertions(+), 41 deletions(-) diff --git a/kedro/runner/parallel_runner.py b/kedro/runner/parallel_runner.py index 62d7e1216b..3418dbcd11 100644 --- a/kedro/runner/parallel_runner.py +++ b/kedro/runner/parallel_runner.py @@ -22,7 +22,7 @@ ) from kedro.framework.project import settings from kedro.io import ( - DataCatalog, + AbstractDataCatalog, DatasetNotFoundError, MemoryDataset, SharedMemoryDataset, @@ -60,7 +60,7 @@ def _bootstrap_subprocess( def _run_node_synchronization( # noqa: PLR0913 node: Node, - catalog: DataCatalog, + catalog: AbstractDataCatalog, is_async: bool = False, session_id: str | None = None, package_name: str | None = None, @@ -73,7 +73,7 @@ def _run_node_synchronization( # noqa: PLR0913 Args: node: The ``Node`` to run. - catalog: A ``DataCatalog`` containing the node's inputs and outputs. + catalog: A ``AbstractDataCatalog`` containing the node's inputs and outputs. is_async: If True, the node inputs and outputs are loaded and saved asynchronously with threads. Defaults to False. session_id: The session id of the pipeline run. @@ -118,7 +118,7 @@ def __init__( cannot be larger than 61 and will be set to min(61, max_workers). is_async: If True, the node inputs and outputs are loaded and saved asynchronously with threads. Defaults to False. - extra_dataset_patterns: Extra dataset factory patterns to be added to the DataCatalog + extra_dataset_patterns: Extra dataset factory patterns to be added to the AbstractDataCatalog during the run. This is used to set the default datasets to SharedMemoryDataset for `ParallelRunner`. @@ -168,7 +168,9 @@ def _validate_nodes(cls, nodes: Iterable[Node]) -> None: ) @classmethod - def _validate_catalog(cls, catalog: DataCatalog, pipeline: Pipeline) -> None: + def _validate_catalog( + cls, catalog: AbstractDataCatalog, pipeline: Pipeline + ) -> None: """Ensure that all data sets are serialisable and that we do not have any non proxied memory data sets being used as outputs as their content will not be synchronized across threads. @@ -213,7 +215,9 @@ def _validate_catalog(cls, catalog: DataCatalog, pipeline: Pipeline) -> None: f"MemoryDatasets" ) - def _set_manager_datasets(self, catalog: DataCatalog, pipeline: Pipeline) -> None: + def _set_manager_datasets( + self, catalog: AbstractDataCatalog, pipeline: Pipeline + ) -> None: for dataset in pipeline.datasets(): try: catalog.exists(dataset) @@ -240,7 +244,7 @@ def _get_required_workers_count(self, pipeline: Pipeline) -> int: def _run( self, pipeline: Pipeline, - catalog: DataCatalog, + catalog: AbstractDataCatalog, hook_manager: PluginManager, session_id: str | None = None, ) -> None: @@ -248,7 +252,7 @@ def _run( Args: pipeline: The ``Pipeline`` to run. - catalog: The ``DataCatalog`` from which to fetch data. + catalog: The ``AbstractDataCatalog`` from which to fetch data. hook_manager: The ``PluginManager`` to activate hooks. session_id: The id of the session. diff --git a/kedro/runner/runner.py b/kedro/runner/runner.py index 6f165e87c0..1763fd5036 100644 --- a/kedro/runner/runner.py +++ b/kedro/runner/runner.py @@ -21,7 +21,7 @@ from more_itertools import interleave from kedro.framework.hooks.manager import _NullPluginManager -from kedro.io import DataCatalog, MemoryDataset +from kedro.io import AbstractDataCatalog, MemoryDataset from kedro.pipeline import Pipeline if TYPE_CHECKING: @@ -45,7 +45,7 @@ def __init__( Args: is_async: If True, the node inputs and outputs are loaded and saved asynchronously with threads. Defaults to False. - extra_dataset_patterns: Extra dataset factory patterns to be added to the DataCatalog + extra_dataset_patterns: Extra dataset factory patterns to be added to the AbstractDataCatalog during the run. This is used to set the default datasets on the Runner instances. """ @@ -59,7 +59,7 @@ def _logger(self) -> logging.Logger: def run( self, pipeline: Pipeline, - catalog: DataCatalog, + catalog: AbstractDataCatalog, hook_manager: PluginManager | None = None, session_id: str | None = None, ) -> dict[str, Any]: @@ -68,7 +68,7 @@ def run( Args: pipeline: The ``Pipeline`` to run. - catalog: The ``DataCatalog`` from which to fetch data. + catalog: The ``AbstractDataCatalog`` from which to fetch data. hook_manager: The ``PluginManager`` to activate hooks. session_id: The id of the session. @@ -76,7 +76,7 @@ def run( ValueError: Raised when ``Pipeline`` inputs cannot be satisfied. Returns: - Any node outputs that cannot be processed by the ``DataCatalog``. + Any node outputs that cannot be processed by the ``AbstractDataCatalog``. These are returned in a dictionary, where the keys are defined by the node outputs. @@ -94,7 +94,7 @@ def run( if unsatisfied: raise ValueError( - f"Pipeline input(s) {unsatisfied} not found in the DataCatalog" + f"Pipeline input(s) {unsatisfied} not found in the AbstractDataCatalog" ) # Identify MemoryDataset in the catalog @@ -124,7 +124,10 @@ def run( return {ds_name: catalog.load(ds_name) for ds_name in free_outputs} def run_only_missing( - self, pipeline: Pipeline, catalog: DataCatalog, hook_manager: PluginManager + self, + pipeline: Pipeline, + catalog: AbstractDataCatalog, + hook_manager: PluginManager, ) -> dict[str, Any]: """Run only the missing outputs from the ``Pipeline`` using the datasets provided by ``catalog``, and save results back to the @@ -132,7 +135,7 @@ def run_only_missing( Args: pipeline: The ``Pipeline`` to run. - catalog: The ``DataCatalog`` from which to fetch data. + catalog: The ``AbstractDataCatalog`` from which to fetch data. hook_manager: The ``PluginManager`` to activate hooks. Raises: ValueError: Raised when ``Pipeline`` inputs cannot be @@ -140,7 +143,7 @@ def run_only_missing( Returns: Any node outputs that cannot be processed by the - ``DataCatalog``. These are returned in a dictionary, where + ``AbstractDataCatalog``. These are returned in a dictionary, where the keys are defined by the node outputs. """ @@ -164,7 +167,7 @@ def run_only_missing( def _run( self, pipeline: Pipeline, - catalog: DataCatalog, + catalog: AbstractDataCatalog, hook_manager: PluginManager, session_id: str | None = None, ) -> None: @@ -173,7 +176,7 @@ def _run( Args: pipeline: The ``Pipeline`` to run. - catalog: The ``DataCatalog`` from which to fetch data. + catalog: The ``AbstractDataCatalog`` from which to fetch data. hook_manager: The ``PluginManager`` to activate hooks. session_id: The id of the session. @@ -184,7 +187,7 @@ def _suggest_resume_scenario( self, pipeline: Pipeline, done_nodes: Iterable[Node], - catalog: DataCatalog, + catalog: AbstractDataCatalog, ) -> None: """ Suggest a command to the user to resume a run after it fails. @@ -194,7 +197,7 @@ def _suggest_resume_scenario( Args: pipeline: the ``Pipeline`` of the run. done_nodes: the ``Node``s that executed successfully. - catalog: the ``DataCatalog`` of the run. + catalog: the ``AbstractDataCatalog`` of the run. """ remaining_nodes = set(pipeline.nodes) - set(done_nodes) @@ -223,7 +226,7 @@ def _suggest_resume_scenario( def _find_nodes_to_resume_from( - pipeline: Pipeline, unfinished_nodes: Collection[Node], catalog: DataCatalog + pipeline: Pipeline, unfinished_nodes: Collection[Node], catalog: AbstractDataCatalog ) -> set[str]: """Given a collection of unfinished nodes in a pipeline using a certain catalog, find the node names to pass to pipeline.from_nodes() @@ -233,7 +236,7 @@ def _find_nodes_to_resume_from( Args: pipeline: the ``Pipeline`` to find starting nodes for. unfinished_nodes: collection of ``Node``s that have not finished yet - catalog: the ``DataCatalog`` of the run. + catalog: the ``AbstractDataCatalog`` of the run. Returns: Set of node names to pass to pipeline.from_nodes() to continue @@ -251,7 +254,7 @@ def _find_nodes_to_resume_from( def _find_all_nodes_for_resumed_pipeline( - pipeline: Pipeline, unfinished_nodes: Iterable[Node], catalog: DataCatalog + pipeline: Pipeline, unfinished_nodes: Iterable[Node], catalog: AbstractDataCatalog ) -> set[Node]: """Breadth-first search approach to finding the complete set of ``Node``s which need to run to cover all unfinished nodes, @@ -261,7 +264,7 @@ def _find_all_nodes_for_resumed_pipeline( Args: pipeline: the ``Pipeline`` to analyze. unfinished_nodes: the iterable of ``Node``s which have not finished yet. - catalog: the ``DataCatalog`` of the run. + catalog: the ``AbstractDataCatalog`` of the run. Returns: A set containing all input unfinished ``Node``s and all remaining @@ -309,12 +312,14 @@ def _nodes_with_external_inputs(nodes_of_interest: Iterable[Node]) -> set[Node]: return set(p_nodes_with_external_inputs.nodes) -def _enumerate_non_persistent_inputs(node: Node, catalog: DataCatalog) -> set[str]: +def _enumerate_non_persistent_inputs( + node: Node, catalog: AbstractDataCatalog +) -> set[str]: """Enumerate non-persistent input datasets of a ``Node``. Args: node: the ``Node`` to check the inputs of. - catalog: the ``DataCatalog`` of the run. + catalog: the ``AbstractDataCatalog`` of the run. Returns: Set of names of non-persistent inputs of given ``Node``. @@ -379,7 +384,7 @@ def _find_initial_node_group(pipeline: Pipeline, nodes: Iterable[Node]) -> list[ def run_node( node: Node, - catalog: DataCatalog, + catalog: AbstractDataCatalog, hook_manager: PluginManager, is_async: bool = False, session_id: str | None = None, @@ -388,7 +393,7 @@ def run_node( Args: node: The ``Node`` to run. - catalog: A ``DataCatalog`` containing the node's inputs and outputs. + catalog: A ``AbstractDataCatalog`` containing the node's inputs and outputs. hook_manager: The ``PluginManager`` to activate hooks. is_async: If True, the node inputs and outputs are loaded and saved asynchronously with threads. Defaults to False. @@ -422,7 +427,7 @@ def run_node( def _collect_inputs_from_hook( # noqa: PLR0913 node: Node, - catalog: DataCatalog, + catalog: AbstractDataCatalog, inputs: dict[str, Any], is_async: bool, hook_manager: PluginManager, @@ -455,7 +460,7 @@ def _collect_inputs_from_hook( # noqa: PLR0913 def _call_node_run( # noqa: PLR0913 node: Node, - catalog: DataCatalog, + catalog: AbstractDataCatalog, inputs: dict[str, Any], is_async: bool, hook_manager: PluginManager, @@ -486,7 +491,7 @@ def _call_node_run( # noqa: PLR0913 def _run_node_sequential( node: Node, - catalog: DataCatalog, + catalog: AbstractDataCatalog, hook_manager: PluginManager, session_id: str | None = None, ) -> Node: @@ -533,7 +538,7 @@ def _run_node_sequential( def _run_node_async( node: Node, - catalog: DataCatalog, + catalog: AbstractDataCatalog, hook_manager: PluginManager, session_id: str | None = None, ) -> Node: diff --git a/kedro/runner/sequential_runner.py b/kedro/runner/sequential_runner.py index 48dac3cd54..6412e84b5c 100644 --- a/kedro/runner/sequential_runner.py +++ b/kedro/runner/sequential_runner.py @@ -14,7 +14,7 @@ if TYPE_CHECKING: from pluggy import PluginManager - from kedro.io import DataCatalog + from kedro.io import AbstractDataCatalog from kedro.pipeline import Pipeline @@ -34,7 +34,7 @@ def __init__( Args: is_async: If True, the node inputs and outputs are loaded and saved asynchronously with threads. Defaults to False. - extra_dataset_patterns: Extra dataset factory patterns to be added to the DataCatalog + extra_dataset_patterns: Extra dataset factory patterns to be added to the AbstractDataCatalog during the run. This is used to set the default datasets to MemoryDataset for `SequentialRunner`. @@ -48,7 +48,7 @@ def __init__( def _run( self, pipeline: Pipeline, - catalog: DataCatalog, + catalog: AbstractDataCatalog, hook_manager: PluginManager, session_id: str | None = None, ) -> None: @@ -56,7 +56,7 @@ def _run( Args: pipeline: The ``Pipeline`` to run. - catalog: The ``DataCatalog`` from which to fetch data. + catalog: The ``AbstractDataCatalog`` from which to fetch data. hook_manager: The ``PluginManager`` to activate hooks. session_id: The id of the session. diff --git a/kedro/runner/thread_runner.py b/kedro/runner/thread_runner.py index b4751a602a..26b08ef2b4 100644 --- a/kedro/runner/thread_runner.py +++ b/kedro/runner/thread_runner.py @@ -16,7 +16,7 @@ if TYPE_CHECKING: from pluggy import PluginManager - from kedro.io import DataCatalog + from kedro.io import AbstractDataCatalog from kedro.pipeline import Pipeline from kedro.pipeline.node import Node @@ -43,7 +43,7 @@ def __init__( is_async: If True, set to False, because `ThreadRunner` doesn't support loading and saving the node inputs and outputs asynchronously with threads. Defaults to False. - extra_dataset_patterns: Extra dataset factory patterns to be added to the DataCatalog + extra_dataset_patterns: Extra dataset factory patterns to be added to the AbstractDataCatalog during the run. This is used to set the default datasets to MemoryDataset for `ThreadRunner`. @@ -87,7 +87,7 @@ def _get_required_workers_count(self, pipeline: Pipeline) -> int: def _run( self, pipeline: Pipeline, - catalog: DataCatalog, + catalog: AbstractDataCatalog, hook_manager: PluginManager, session_id: str | None = None, ) -> None: @@ -95,7 +95,7 @@ def _run( Args: pipeline: The ``Pipeline`` to run. - catalog: The ``DataCatalog`` from which to fetch data. + catalog: The ``AbstractDataCatalog`` from which to fetch data. hook_manager: The ``PluginManager`` to activate hooks. session_id: The id of the session. From ced1b7a12ff09c243fbb337c64db317b2f754aeb Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 9 Sep 2024 15:54:22 +0100 Subject: [PATCH 067/173] Updated default catalog validation Signed-off-by: Elena Khaustova --- kedro/framework/project/__init__.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kedro/framework/project/__init__.py b/kedro/framework/project/__init__.py index a3248b9daf..16b58261a2 100644 --- a/kedro/framework/project/__init__.py +++ b/kedro/framework/project/__init__.py @@ -51,6 +51,12 @@ def validate( default_class = self.default(settings, self) for name in self.names: setting_value = getattr(settings, name) + # Allow using new KedroDataCatalog + if ( + default_class == "kedro.io.data_catalog.DataCatalog" + and setting_value == "kedro.io.data_catalog_redesign.KedroDataCatalog" + ): + continue if not issubclass(setting_value, default_class): raise ValidationError( f"Invalid value '{setting_value.__module__}.{setting_value.__qualname__}' " From 7f9b57649fd5f75cd0262085fe95501d2ccd75f4 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 9 Sep 2024 16:00:27 +0100 Subject: [PATCH 068/173] Updated default catalog validation Signed-off-by: Elena Khaustova --- kedro/framework/project/__init__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kedro/framework/project/__init__.py b/kedro/framework/project/__init__.py index 16b58261a2..9427a40c06 100644 --- a/kedro/framework/project/__init__.py +++ b/kedro/framework/project/__init__.py @@ -53,8 +53,10 @@ def validate( setting_value = getattr(settings, name) # Allow using new KedroDataCatalog if ( - default_class == "kedro.io.data_catalog.DataCatalog" - and setting_value == "kedro.io.data_catalog_redesign.KedroDataCatalog" + f"{setting_value.__module__}.{setting_value.__qualname__}" + == "kedro.io.data_catalog_redesign.KedroDataCatalog" + and f"{default_class.__module__}.{default_class.__qualname__}" + == "kedro.io.data_catalog.DataCatalog" ): continue if not issubclass(setting_value, default_class): From a3828d9bffa53e1edd66f3625fcc5b9180026e77 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 9 Sep 2024 16:21:08 +0100 Subject: [PATCH 069/173] Updated contains and added exists methods for KedroDataCatalog Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 37 ++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index cb36649571..1540c62ce9 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -62,7 +62,7 @@ def __init__( self._init_dataset(ds_name, ds_config) if feed_dict: - self.add_feed_dict(feed_dict) + self.add_from_dict(feed_dict) @property def datasets(self) -> dict[str, Any]: @@ -84,13 +84,20 @@ def __iter__(self): def __getitem__(self, ds_name: str) -> AbstractDataset: return self.get_dataset(ds_name) - def __contains__(self, ds_name: str) -> bool: - """Check if an item is in the catalog""" - return ds_name in self._datasets + def __contains__(self, dataset_name: str) -> bool: + """Check if an item is in the catalog as a materialised dataset or pattern""" + return ( + dataset_name in self._datasets + or self._config_resolver.match_pattern(dataset_name) is not None + ) def _ipython_key_completions_(self) -> list[str]: return list(self._datasets.keys()) + @property + def _logger(self) -> logging.Logger: + return logging.getLogger(__name__) + @classmethod def from_config( cls, @@ -188,10 +195,6 @@ def add( ) self._datasets[ds_name] = dataset - @property - def _logger(self) -> logging.Logger: - return logging.getLogger(__name__) - def list(self, regex_search: str | None = None) -> list[str]: """ List of all dataset names registered in the catalog. @@ -295,5 +298,23 @@ def shallow_copy( self._config_resolver.add_runtime_patterns(extra_dataset_patterns) return self + def exists(self, name: str) -> bool: + """Checks whether registered data set exists by calling its `exists()` + method. Raises a warning and returns False if `exists()` is not + implemented. + + Args: + name: A data set to be checked. + + Returns: + Whether the data set output exists. + + """ + try: + dataset = self._get_dataset(name) + except DatasetNotFoundError: + return False + return dataset.exists() + AbstractDataCatalog: UnionType = DataCatalog | KedroDataCatalog From 16610c4ecbe7d738f0a89d1b935985656a413ebb Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 9 Sep 2024 17:26:50 +0100 Subject: [PATCH 070/173] Fixed docs Signed-off-by: Elena Khaustova --- kedro/framework/project/__init__.py | 1 + kedro/io/data_catalog_redesign.py | 7 ++----- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/kedro/framework/project/__init__.py b/kedro/framework/project/__init__.py index 9427a40c06..a16e033528 100644 --- a/kedro/framework/project/__init__.py +++ b/kedro/framework/project/__init__.py @@ -52,6 +52,7 @@ def validate( for name in self.names: setting_value = getattr(settings, name) # Allow using new KedroDataCatalog + # TODO: remove with the old catalog if ( f"{setting_value.__module__}.{setting_value.__qualname__}" == "kedro.io.data_catalog_redesign.KedroDataCatalog" diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index 1540c62ce9..bf61075fe9 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -11,7 +11,7 @@ import difflib import logging import re -from typing import TYPE_CHECKING, Any +from typing import Any, Union from kedro.io import DataCatalog from kedro.io.catalog_config_resolver import DataCatalogConfigResolver, Patterns @@ -27,9 +27,6 @@ from kedro.io.memory_dataset import MemoryDataset from kedro.utils import _format_rich, _has_rich_handler -if TYPE_CHECKING: - from types import UnionType - CREDENTIALS_KEY = "credentials" @@ -317,4 +314,4 @@ def exists(self, name: str) -> bool: return dataset.exists() -AbstractDataCatalog: UnionType = DataCatalog | KedroDataCatalog +AbstractDataCatalog: type = Union[DataCatalog, KedroDataCatalog] From 321affe9a01bfd5ddf136d0e03556a4e4b419f1e Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 9 Sep 2024 17:39:43 +0100 Subject: [PATCH 071/173] Fixing docs and lint Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index bf61075fe9..8208ca5c96 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -66,7 +66,7 @@ def datasets(self) -> dict[str, Any]: return copy.deepcopy(self._datasets) @datasets.setter - def datasets(self, value: Any): + def datasets(self, value: Any) -> None: raise AttributeError( "Operation not allowed! Please change datasets through configuration." ) @@ -75,7 +75,7 @@ def datasets(self, value: Any): def config_resolver(self) -> DataCatalogConfigResolver: return self._config_resolver - def __iter__(self): + def __iter__(self) -> AbstractDataset: yield from self._datasets.values() def __getitem__(self, ds_name: str) -> AbstractDataset: @@ -151,7 +151,7 @@ def get_dataset( ds_config = self._config_resolver.resolve_dataset_pattern(ds_name) if ds_name not in self._datasets and ds_config is not None: - self._init_dataset(ds_name, ds_config) + self._init_dataset(ds_name, ds_config) # type: ignore[arg-type] dataset = self._datasets.get(ds_name, None) @@ -274,8 +274,8 @@ def add_from_dict(self, datasets: dict[str, Any], replace: bool = False) -> None dataset = ( ds_data if isinstance(ds_data, AbstractDataset) - else MemoryDataset(data=ds_data) - ) # type: ignore[abstract] + else MemoryDataset(data=ds_data) # type: ignore[abstract] + ) self.add(ds_name, dataset, replace) def add_feed_dict(self, feed_dict: dict[str, Any], replace: bool = False) -> None: @@ -314,4 +314,4 @@ def exists(self, name: str) -> bool: return dataset.exists() -AbstractDataCatalog: type = Union[DataCatalog, KedroDataCatalog] +AbstractDataCatalog = Union[DataCatalog, KedroDataCatalog] From ff25405f82005684af0d5d437ed85eac5a92f5da Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 9 Sep 2024 17:44:58 +0100 Subject: [PATCH 072/173] Fixed docs Signed-off-by: Elena Khaustova --- docs/source/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/conf.py b/docs/source/conf.py index 635a5220a0..3a8241547c 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -133,6 +133,7 @@ "kedro.io.core.DatasetError", "kedro.io.core.Version", "kedro.io.data_catalog.DataCatalog", + "kedro.io.data_catalog_redesign.KedroDataCatalog", "kedro.io.memory_dataset.MemoryDataset", "kedro.io.partitioned_dataset.PartitionedDataset", "kedro.pipeline.pipeline.Pipeline", From d0000c0d043af7b5f2cdd4901517775cb3ef4e3d Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 9 Sep 2024 17:59:00 +0100 Subject: [PATCH 073/173] Fixed docs Signed-off-by: Elena Khaustova --- docs/source/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/conf.py b/docs/source/conf.py index 3a8241547c..a5fa1d0315 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -171,6 +171,7 @@ "None. Update D from mapping/iterable E and F.", "Patterns", "DataCatalogConfigResolver", + "AbstractDataCatalog", ), "py:data": ( "typing.Any", From 7f5ddec0b8e2505d4474005fc9d598ca2d933881 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 10 Sep 2024 11:24:33 +0100 Subject: [PATCH 074/173] Fixed unit tests Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 4 ++-- tests/runner/test_sequential_runner.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index 8208ca5c96..7df8ea57f3 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -103,9 +103,9 @@ def from_config( load_versions: dict[str, str] | None = None, save_version: str | None = None, ) -> KedroDataCatalog: - """Create a ``DataCatalog`` instance from configuration. This is a + """Create a ``KedroDataCatalog`` instance from configuration. This is a factory method used to provide developers with a way to instantiate - ``DataCatalog`` with configuration parsed from configuration files. + ``KedroDataCatalog`` with configuration parsed from configuration files. """ catalog = catalog or {} config_resolver = DataCatalogConfigResolver(catalog, credentials) diff --git a/tests/runner/test_sequential_runner.py b/tests/runner/test_sequential_runner.py index dbc73a30f0..e74bf6263d 100644 --- a/tests/runner/test_sequential_runner.py +++ b/tests/runner/test_sequential_runner.py @@ -130,7 +130,7 @@ def test_conflict_feed_catalog( def test_unsatisfied_inputs(self, is_async, unfinished_outputs_pipeline, catalog): """ds1, ds2 and ds3 were not specified.""" - with pytest.raises(ValueError, match=r"not found in the DataCatalog"): + with pytest.raises(ValueError, match=r"not found in the AbstractDataCatalog"): SequentialRunner(is_async=is_async).run( unfinished_outputs_pipeline, catalog ) From e030bb666e1aa825baf36812b33855c6784cc60f Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 10 Sep 2024 11:31:54 +0100 Subject: [PATCH 075/173] Added __eq__ Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index 7df8ea57f3..17643ed3d2 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -27,8 +27,6 @@ from kedro.io.memory_dataset import MemoryDataset from kedro.utils import _format_rich, _has_rich_handler -CREDENTIALS_KEY = "credentials" - def validate_dataset_config(ds_name: str, ds_config: Any) -> None: if not isinstance(ds_config, dict): @@ -88,6 +86,12 @@ def __contains__(self, dataset_name: str) -> bool: or self._config_resolver.match_pattern(dataset_name) is not None ) + def __eq__(self, other) -> bool: # type: ignore[no-untyped-def] + return (self._datasets, self._config_resolver.list_patterns()) == ( + other.datasets, + other.config_resolver.list_patterns(), + ) + def _ipython_key_completions_(self) -> list[str]: return list(self._datasets.keys()) From 6433dd8dc47e43a19181dfbfcf7d861dab51efaf Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 10 Sep 2024 14:59:19 +0100 Subject: [PATCH 076/173] Renamed DataCatalogConfigResolver to CatalogConfigResolver Signed-off-by: Elena Khaustova --- docs/source/conf.py | 4 ++-- kedro/io/__init__.py | 4 ++-- kedro/io/catalog_config_resolver.py | 4 ++-- kedro/io/data_catalog.py | 12 ++++++------ tests/framework/session/test_session.py | 2 +- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 635a5220a0..2c3a2c4c00 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -127,7 +127,7 @@ "typing.Type", "typing.Set", "kedro.config.config.ConfigLoader", - "kedro.io.catalog_config_resolver.DataCatalogConfigResolver", + "kedro.io.catalog_config_resolver.CatalogConfigResolver", "kedro.io.core.AbstractDataset", "kedro.io.core.AbstractVersionedDataset", "kedro.io.core.DatasetError", @@ -169,7 +169,7 @@ "D[k] if k in D, else d. d defaults to None.", "None. Update D from mapping/iterable E and F.", "Patterns", - "DataCatalogConfigResolver", + "CatalogConfigResolver", ), "py:data": ( "typing.Any", diff --git a/kedro/io/__init__.py b/kedro/io/__init__.py index 5d17d6f058..4b4a2e1b52 100644 --- a/kedro/io/__init__.py +++ b/kedro/io/__init__.py @@ -5,7 +5,7 @@ from __future__ import annotations from .cached_dataset import CachedDataset -from .catalog_config_resolver import DataCatalogConfigResolver +from .catalog_config_resolver import CatalogConfigResolver from .core import ( AbstractDataset, AbstractVersionedDataset, @@ -24,7 +24,7 @@ "AbstractVersionedDataset", "CachedDataset", "DataCatalog", - "DataCatalogConfigResolver", + "CatalogConfigResolver", "DatasetAlreadyExistsError", "DatasetError", "DatasetNotFoundError", diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index 73aeb4a830..d2e6a14a4b 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -1,4 +1,4 @@ -"""``DataCatalogConfigResolver`` resolves dataset configurations and datasets' +"""``CatalogConfigResolver`` resolves dataset configurations and datasets' patterns based on catalog configuration and credentials provided. """ @@ -93,7 +93,7 @@ def _resolve_dataset_config( return config -class DataCatalogConfigResolver: +class CatalogConfigResolver: """Resolves dataset configurations based on patterns and credentials.""" def __init__( diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index 7cd8c31690..f53a3ea2b9 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -16,7 +16,7 @@ from kedro.io.catalog_config_resolver import ( CREDENTIALS_KEY, # noqa: F401 - DataCatalogConfigResolver, + CatalogConfigResolver, Patterns, ) from kedro.io.core import ( @@ -116,7 +116,7 @@ def __init__( # noqa: PLR0913 default_pattern: Patterns | None = None, # Kept for interface compatibility load_versions: dict[str, str] | None = None, save_version: str | None = None, - config_resolver: DataCatalogConfigResolver | None = None, + config_resolver: CatalogConfigResolver | None = None, ) -> None: """``DataCatalog`` stores instances of ``AbstractDataset`` implementations to provide ``load`` and ``save`` capabilities from @@ -147,7 +147,7 @@ def __init__( # noqa: PLR0913 sorted in lexicographical order. default_pattern: A dictionary of the default catch-all pattern that overrides the default pattern provided through the runners. - config_resolver: An instance of DataCatalogConfigResolver to resolve dataset patterns and configurations. + config_resolver: An instance of CatalogConfigResolver to resolve dataset patterns and configurations. Example: @@ -160,7 +160,7 @@ def __init__( # noqa: PLR0913 >>> save_args={"index": False}) >>> catalog = DataCatalog(datasets={'cars': cars}) """ - self._config_resolver = config_resolver or DataCatalogConfigResolver() + self._config_resolver = config_resolver or CatalogConfigResolver() self._datasets: dict[str, AbstractDataset] = {} self.datasets: _FrozenDatasets | None = None @@ -191,7 +191,7 @@ def __eq__(self, other) -> bool: # type: ignore[no-untyped-def] ) @property - def config_resolver(self) -> DataCatalogConfigResolver: + def config_resolver(self) -> CatalogConfigResolver: return self._config_resolver @property @@ -277,7 +277,7 @@ class to be loaded is specified with the key ``type`` and their """ catalog = catalog or {} datasets = {} - config_resolver = DataCatalogConfigResolver(catalog, credentials) + config_resolver = CatalogConfigResolver(catalog, credentials) save_version = save_version or generate_timestamp() load_versions = copy.deepcopy(load_versions) or {} diff --git a/tests/framework/session/test_session.py b/tests/framework/session/test_session.py index 3e2deb38ea..086d581045 100644 --- a/tests/framework/session/test_session.py +++ b/tests/framework/session/test_session.py @@ -693,7 +693,7 @@ def test_run_thread_runner( } mocker.patch("kedro.framework.session.session.pipelines", pipelines_ret) mocker.patch( - "kedro.io.data_catalog.DataCatalogConfigResolver.match_pattern", + "kedro.io.data_catalog.CatalogConfigResolver.match_pattern", return_value=match_pattern, ) From 355576f70b8f825e1ade838cc0aaf236fe8dcc29 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 10 Sep 2024 15:40:39 +0100 Subject: [PATCH 077/173] Renamed _init_configs to _resolve_config_credentials Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index d2e6a14a4b..ed539baad9 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -105,7 +105,7 @@ def __init__( self._dataset_patterns, self._default_pattern = self._extract_patterns( config, credentials ) - self._resolved_configs = self._init_configs(config, credentials) + self._resolved_configs = self._resolve_config_credentials(config, credentials) @property def config(self) -> dict[str, dict[str, Any]]: @@ -192,7 +192,7 @@ def _extract_patterns( return sorted_patterns, user_default - def _init_configs( + def _resolve_config_credentials( self, config: dict[str, dict[str, Any]] | None, credentials: dict[str, dict[str, Any]] | None, From 39d9ff681413a8edbf79dcd41fb46e1207dbaa89 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 10 Sep 2024 16:02:40 +0100 Subject: [PATCH 078/173] Moved functions to the class Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 163 +++++++++++++++------------- 1 file changed, 85 insertions(+), 78 deletions(-) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index ed539baad9..a2d08f5f43 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -18,81 +18,6 @@ CREDENTIALS_KEY = "credentials" -def _fetch_credentials(credentials_name: str, credentials: dict[str, Any]) -> Any: - """Fetch the specified credentials from the provided credentials dictionary. - - Args: - credentials_name: Credentials name. - credentials: A dictionary with all credentials. - - Returns: - The set of requested credentials. - - Raises: - KeyError: When a data set with the given name has not yet been - registered. - - """ - try: - return credentials[credentials_name] - except KeyError as exc: - raise KeyError( - f"Unable to find credentials '{credentials_name}': check your data " - "catalog and credentials configuration. See " - "https://kedro.readthedocs.io/en/stable/kedro.io.DataCatalog.html " - "for an example." - ) from exc - - -def _resolve_credentials( - config: dict[str, Any], credentials: dict[str, Any] -) -> dict[str, Any]: - """Return the dataset configuration where credentials are resolved using - credentials dictionary provided. - - Args: - config: Original dataset config, which may contain unresolved credentials. - credentials: A dictionary with all credentials. - - Returns: - The dataset config, where all the credentials are successfully resolved. - """ - config = copy.deepcopy(config) - - def _resolve_value(key: str, value: Any) -> Any: - if key == CREDENTIALS_KEY and isinstance(value, str): - return _fetch_credentials(value, credentials) - if isinstance(value, dict): - return {k: _resolve_value(k, v) for k, v in value.items()} - return value - - return {k: _resolve_value(k, v) for k, v in config.items()} - - -def _resolve_dataset_config( - ds_name: str, - pattern: str, - config: Any, -) -> Any: - """Resolve dataset configuration based on the provided pattern.""" - resolved_vars = parse(pattern, ds_name) - # Resolve the factory config for the dataset - if isinstance(config, dict): - for key, value in config.items(): - config[key] = _resolve_dataset_config(ds_name, pattern, value) - elif isinstance(config, (list, tuple)): - config = [_resolve_dataset_config(ds_name, pattern, value) for value in config] - elif isinstance(config, str) and "}" in config: - try: - config = config.format_map(resolved_vars.named) - except KeyError as exc: - raise KeyError( - f"Unable to resolve '{config}' from the pattern '{pattern}'. Keys used in the configuration " - f"should be present in the dataset factory pattern." - ) from exc - return config - - class CatalogConfigResolver: """Resolves dataset configurations based on patterns and credentials.""" @@ -153,6 +78,84 @@ def _sort_patterns(cls, dataset_patterns: Patterns) -> Patterns: ) return {key: dataset_patterns[key] for key in sorted_keys} + @staticmethod + def _fetch_credentials(credentials_name: str, credentials: dict[str, Any]) -> Any: + """Fetch the specified credentials from the provided credentials dictionary. + + Args: + credentials_name: Credentials name. + credentials: A dictionary with all credentials. + + Returns: + The set of requested credentials. + + Raises: + KeyError: When a data set with the given name has not yet been + registered. + + """ + try: + return credentials[credentials_name] + except KeyError as exc: + raise KeyError( + f"Unable to find credentials '{credentials_name}': check your data " + "catalog and credentials configuration. See " + "https://kedro.readthedocs.io/en/stable/kedro.io.DataCatalog.html " + "for an example." + ) from exc + + @classmethod + def _resolve_credentials( + cls, config: dict[str, Any], credentials: dict[str, Any] + ) -> dict[str, Any]: + """Return the dataset configuration where credentials are resolved using + credentials dictionary provided. + + Args: + config: Original dataset config, which may contain unresolved credentials. + credentials: A dictionary with all credentials. + + Returns: + The dataset config, where all the credentials are successfully resolved. + """ + config = copy.deepcopy(config) + + def _resolve_value(key: str, value: Any) -> Any: + if key == CREDENTIALS_KEY and isinstance(value, str): + return cls._fetch_credentials(value, credentials) + if isinstance(value, dict): + return {k: _resolve_value(k, v) for k, v in value.items()} + return value + + return {k: _resolve_value(k, v) for k, v in config.items()} + + @classmethod + def _resolve_dataset_config( + cls, + ds_name: str, + pattern: str, + config: Any, + ) -> Any: + """Resolve dataset configuration based on the provided pattern.""" + resolved_vars = parse(pattern, ds_name) + # Resolve the factory config for the dataset + if isinstance(config, dict): + for key, value in config.items(): + config[key] = cls._resolve_dataset_config(ds_name, pattern, value) + elif isinstance(config, (list, tuple)): + config = [ + cls._resolve_dataset_config(ds_name, pattern, value) for value in config + ] + elif isinstance(config, str) and "}" in config: + try: + config = config.format_map(resolved_vars.named) + except KeyError as exc: + raise KeyError( + f"Unable to resolve '{config}' from the pattern '{pattern}'. Keys used in the configuration " + f"should be present in the dataset factory pattern." + ) from exc + return config + def list_patterns(self) -> list[str]: """List al patterns available in the catalog.""" return ( @@ -181,7 +184,9 @@ def _extract_patterns( for ds_name, ds_config in config.items(): if cls.is_pattern(ds_name): - dataset_patterns[ds_name] = _resolve_credentials(ds_config, credentials) + dataset_patterns[ds_name] = cls._resolve_credentials( + ds_config, credentials + ) sorted_patterns = cls._sort_patterns(dataset_patterns) if sorted_patterns: @@ -211,7 +216,9 @@ def _resolve_config_credentials( "make sure that the key is preceded by an underscore." ) if not self.is_pattern(ds_name): - resolved_configs[ds_name] = _resolve_credentials(ds_config, credentials) + resolved_configs[ds_name] = self._resolve_credentials( + ds_config, credentials + ) return resolved_configs @@ -233,7 +240,7 @@ def resolve_dataset_pattern( or self._runtime_patterns.get(matched_pattern) or {} ) - ds_config = _resolve_dataset_config( + ds_config = self._resolve_dataset_config( ds_name, matched_pattern, config_copy ) From 659c9daedf88d3993816c391f027cf5cd19ed1a4 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 10 Sep 2024 16:16:57 +0100 Subject: [PATCH 079/173] Refactored resolve_dataset_pattern Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index a2d08f5f43..99274095f1 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -170,6 +170,14 @@ def match_pattern(self, ds_name: str) -> str | None: matches = (pattern for pattern in all_patterns if parse(pattern, ds_name)) return next(matches, None) + def _get_pattern_config(self, pattern: str) -> dict[str, Any]: + return ( + self._dataset_patterns.get(pattern) + or self._default_pattern.get(pattern) + or self._runtime_patterns.get(pattern) + or {} + ) + @classmethod def _extract_patterns( cls, @@ -232,16 +240,9 @@ def resolve_dataset_pattern( for ds_name in datasets_lst: matched_pattern = self.match_pattern(ds_name) if matched_pattern and ds_name not in self._resolved_configs: - # If the dataset is a patterned dataset, materialise it and add it to - # the catalog - config_copy = copy.deepcopy( - self._dataset_patterns.get(matched_pattern) - or self._default_pattern.get(matched_pattern) - or self._runtime_patterns.get(matched_pattern) - or {} - ) + pattern_config = self._get_pattern_config(matched_pattern) ds_config = self._resolve_dataset_config( - ds_name, matched_pattern, config_copy + ds_name, matched_pattern, copy.deepcopy(pattern_config) ) if ( @@ -255,10 +256,8 @@ def resolve_dataset_pattern( ds_name, ) resolved_configs.append(ds_config) - elif ds_name in self._resolved_configs: - resolved_configs.append(self._resolved_configs.get(ds_name)) - else: - resolved_configs.append(None) + + resolved_configs.append(self._resolved_configs.get(ds_name, None)) return resolved_configs[0] if isinstance(datasets, str) else resolved_configs From 840b32a7d760bc69393b07921f59e3c7cad9c492 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 10 Sep 2024 17:27:00 +0100 Subject: [PATCH 080/173] Fixed refactored part Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index 99274095f1..a17b4725a5 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -256,8 +256,8 @@ def resolve_dataset_pattern( ds_name, ) resolved_configs.append(ds_config) - - resolved_configs.append(self._resolved_configs.get(ds_name, None)) + else: + resolved_configs.append(self._resolved_configs.get(ds_name, None)) return resolved_configs[0] if isinstance(datasets, str) else resolved_configs From 77f551c3c4a3a3e3f099f82e519a6c8efc372c8d Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 10 Sep 2024 17:27:34 +0100 Subject: [PATCH 081/173] Changed the order of arguments for DataCatalog constructor Signed-off-by: Elena Khaustova --- kedro/io/data_catalog.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index f53a3ea2b9..a195c9e47a 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -113,9 +113,9 @@ def __init__( # noqa: PLR0913 datasets: dict[str, AbstractDataset] | None = None, feed_dict: dict[str, Any] | None = None, dataset_patterns: Patterns | None = None, # Kept for interface compatibility - default_pattern: Patterns | None = None, # Kept for interface compatibility load_versions: dict[str, str] | None = None, save_version: str | None = None, + default_pattern: Patterns | None = None, # Kept for interface compatibility config_resolver: CatalogConfigResolver | None = None, ) -> None: """``DataCatalog`` stores instances of ``AbstractDataset`` @@ -307,9 +307,9 @@ class to be loaded is specified with the key ``type`` and their return cls( datasets=datasets, dataset_patterns=config_resolver._dataset_patterns, - default_pattern=config_resolver._default_pattern, load_versions=load_versions, save_version=save_version, + default_pattern=config_resolver._default_pattern, config_resolver=config_resolver, ) From 6e079a1194f9a6800ba3c59f59bdebea6b865d9f Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 10 Sep 2024 17:29:32 +0100 Subject: [PATCH 082/173] Replaced __getitem__ with .get() Signed-off-by: Elena Khaustova --- kedro/io/data_catalog.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index a195c9e47a..97111e22a9 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -122,7 +122,7 @@ def __init__( # noqa: PLR0913 implementations to provide ``load`` and ``save`` capabilities from anywhere in the program. To use a ``DataCatalog``, you need to instantiate it with a dictionary of data sets. Then it will act as a - single point of reference for your calls, relaying load and save§ + single point of reference for your calls, relaying load and save functions to the underlying data sets. Args: @@ -285,7 +285,7 @@ class to be loaded is specified with the key ``type`` and their if not config_resolver.is_pattern(ds_name): datasets[ds_name] = AbstractDataset.from_config( ds_name, - config_resolver.config[ds_name], + config_resolver.config.get(ds_name), load_versions.get(ds_name), save_version, ) From 1f7e5f88476a6bd7c8cd92bea151f945f4e2c797 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 10 Sep 2024 17:39:45 +0100 Subject: [PATCH 083/173] Updated catalog commands Signed-off-by: Elena Khaustova --- kedro/framework/cli/catalog.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/kedro/framework/cli/catalog.py b/kedro/framework/cli/catalog.py index f6a58664fb..4001b696f3 100644 --- a/kedro/framework/cli/catalog.py +++ b/kedro/framework/cli/catalog.py @@ -3,7 +3,7 @@ from __future__ import annotations from collections import defaultdict -from itertools import chain +from itertools import chain, filterfalse from typing import TYPE_CHECKING, Any import click @@ -126,11 +126,10 @@ def _map_type_to_datasets( datasets of the specific type as a value. """ mapping = defaultdict(list) # type: ignore[var-annotated] - for dataset_name in datasets: - if not is_parameter(dataset_name): - ds_type = datasets_meta[dataset_name].__class__.__name__ - if dataset_name not in mapping[ds_type]: - mapping[ds_type].append(dataset_name) + for dataset_name in filterfalse(is_parameter, datasets): + ds_type = datasets_meta[dataset_name].__class__.__name__ + if dataset_name not in mapping[ds_type]: + mapping[ds_type].append(dataset_name) return mapping @@ -167,13 +166,9 @@ def create_catalog(metadata: ProjectMetadata, pipeline_name: str, env: str) -> N f"'{pipeline_name}' pipeline not found! Existing pipelines: {existing_pipelines}" ) - pipeline_datasets = { - ds_name for ds_name in pipeline.datasets() if not is_parameter(ds_name) - } + pipeline_datasets = set(filterfalse(is_parameter, pipeline.datasets())) - catalog_datasets = { - ds_name for ds_name in context.catalog.list() if not is_parameter(ds_name) - } + catalog_datasets = set(filterfalse(is_parameter, context.catalog.list())) # Datasets that are missing in Data Catalog missing_ds = sorted(pipeline_datasets - catalog_datasets) From 80f0e3d938201527a73e7fed2dbf599148afffb7 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 10 Sep 2024 17:44:48 +0100 Subject: [PATCH 084/173] Moved warm up block outside of the try block Signed-off-by: Elena Khaustova --- kedro/framework/session/session.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kedro/framework/session/session.py b/kedro/framework/session/session.py index 2b13cd1694..caa3553954 100644 --- a/kedro/framework/session/session.py +++ b/kedro/framework/session/session.py @@ -394,11 +394,11 @@ def run( # noqa: PLR0913 run_params=record_data, pipeline=filtered_pipeline, catalog=catalog ) + if isinstance(runner, ThreadRunner): + for ds in filtered_pipeline.datasets(): + if catalog.config_resolver.match_pattern(ds): + _ = catalog._get_dataset(ds) try: - if isinstance(runner, ThreadRunner): - for ds in filtered_pipeline.datasets(): - if catalog.config_resolver.match_pattern(ds): - _ = catalog._get_dataset(ds) run_result = runner.run( filtered_pipeline, catalog, hook_manager, session_id ) From 017cda3ecc562cf21b9f94fd8b008e3eea73da79 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 10 Sep 2024 17:59:51 +0100 Subject: [PATCH 085/173] Fixed linter Signed-off-by: Elena Khaustova --- kedro/io/data_catalog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index 97111e22a9..f025c52190 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -285,7 +285,7 @@ class to be loaded is specified with the key ``type`` and their if not config_resolver.is_pattern(ds_name): datasets[ds_name] = AbstractDataset.from_config( ds_name, - config_resolver.config.get(ds_name), + config_resolver.config.get(ds_name, {}), load_versions.get(ds_name), save_version, ) From cab6f06e2fc33cdd3a030ff17f80b887e6f1d72a Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 10 Sep 2024 18:06:52 +0100 Subject: [PATCH 086/173] Removed odd copying Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 9 ++++----- kedro/io/data_catalog.py | 3 +-- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index a17b4725a5..f3548e4dd5 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -185,8 +185,8 @@ def _extract_patterns( credentials: dict[str, dict[str, Any]] | None, ) -> tuple[Patterns, Patterns]: """Extract and sort patterns from the configuration.""" - config = copy.deepcopy(config) or {} - credentials = copy.deepcopy(credentials) or {} + config = config or {} + credentials = credentials or {} dataset_patterns = {} user_default = {} @@ -211,9 +211,8 @@ def _resolve_config_credentials( credentials: dict[str, dict[str, Any]] | None, ) -> dict[str, dict[str, Any]]: """Initialize the dataset configuration with resolved credentials.""" - # TODO: check if deep copies are required - config = copy.deepcopy(config) or {} - credentials = copy.deepcopy(credentials) or {} + config = config or {} + credentials = credentials or {} resolved_configs = {} for ds_name, ds_config in config.items(): diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index f025c52190..2b09c35e80 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -7,7 +7,6 @@ from __future__ import annotations -import copy import difflib import logging import pprint @@ -279,7 +278,7 @@ class to be loaded is specified with the key ``type`` and their datasets = {} config_resolver = CatalogConfigResolver(catalog, credentials) save_version = save_version or generate_timestamp() - load_versions = copy.deepcopy(load_versions) or {} + load_versions = load_versions or {} for ds_name in catalog: if not config_resolver.is_pattern(ds_name): From e955930bbe175cc2480d029bf8f9984eba5c176c Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 10 Sep 2024 18:50:44 +0100 Subject: [PATCH 087/173] Renamed DataCatalogConfigResolver to CatalogConfigResolver Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index 17643ed3d2..a50c9cda5e 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -14,7 +14,7 @@ from typing import Any, Union from kedro.io import DataCatalog -from kedro.io.catalog_config_resolver import DataCatalogConfigResolver, Patterns +from kedro.io.catalog_config_resolver import CatalogConfigResolver, Patterns from kedro.io.core import ( AbstractDataset, AbstractVersionedDataset, @@ -44,9 +44,9 @@ def __init__( feed_dict: dict[str, Any] | None = None, load_versions: dict[str, str] | None = None, save_version: str | None = None, - config_resolver: DataCatalogConfigResolver | None = None, + config_resolver: CatalogConfigResolver | None = None, ) -> None: - self._config_resolver = config_resolver or DataCatalogConfigResolver() + self._config_resolver = config_resolver or CatalogConfigResolver() self._datasets = datasets or {} self._load_versions = load_versions or {} self._save_version = save_version @@ -70,7 +70,7 @@ def datasets(self, value: Any) -> None: ) @property - def config_resolver(self) -> DataCatalogConfigResolver: + def config_resolver(self) -> CatalogConfigResolver: return self._config_resolver def __iter__(self) -> AbstractDataset: @@ -112,7 +112,7 @@ def from_config( ``KedroDataCatalog`` with configuration parsed from configuration files. """ catalog = catalog or {} - config_resolver = DataCatalogConfigResolver(catalog, credentials) + config_resolver = CatalogConfigResolver(catalog, credentials) save_version = save_version or generate_timestamp() load_versions = load_versions or {} From a07f3d417a9d2e686081890667a2692bfdee0acd Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 10 Sep 2024 18:53:21 +0100 Subject: [PATCH 088/173] Renamed AbstractDataCatalog to BaseDataCatalog Signed-off-by: Elena Khaustova --- docs/source/conf.py | 2 +- kedro/framework/context/context.py | 16 ++++---- kedro/framework/hooks/specs.py | 28 +++++++------- kedro/io/__init__.py | 4 +- kedro/io/data_catalog_redesign.py | 2 +- kedro/runner/parallel_runner.py | 18 ++++----- kedro/runner/runner.py | 52 +++++++++++++------------- kedro/runner/sequential_runner.py | 8 ++-- kedro/runner/thread_runner.py | 8 ++-- tests/runner/test_sequential_runner.py | 2 +- 10 files changed, 68 insertions(+), 72 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index d4361c33cd..188828d7e3 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -171,7 +171,7 @@ "None. Update D from mapping/iterable E and F.", "Patterns", "CatalogConfigResolver", - "AbstractDataCatalog", + "BaseDataCatalog", ), "py:data": ( "typing.Any", diff --git a/kedro/framework/context/context.py b/kedro/framework/context/context.py index 245c64e0b9..15825a4ec9 100644 --- a/kedro/framework/context/context.py +++ b/kedro/framework/context/context.py @@ -14,7 +14,7 @@ from kedro.config import AbstractConfigLoader, MissingConfigException from kedro.framework.project import settings -from kedro.io import AbstractDataCatalog, DataCatalog # noqa: TCH001 +from kedro.io import BaseDataCatalog, DataCatalog # noqa: TCH001 from kedro.pipeline.transcoding import _transcode_split if TYPE_CHECKING: @@ -123,7 +123,7 @@ def _convert_paths_to_absolute_posix( return conf_dictionary -def _validate_transcoded_datasets(catalog: AbstractDataCatalog) -> None: +def _validate_transcoded_datasets(catalog: BaseDataCatalog) -> None: """Validates transcoded datasets are correctly named Args: @@ -178,13 +178,13 @@ class KedroContext: ) @property - def catalog(self) -> AbstractDataCatalog: - """Read-only property referring to Kedro's ``AbstractDataCatalog`` for this context. + def catalog(self) -> BaseDataCatalog: + """Read-only property referring to Kedro's ``BaseDataCatalog`` for this context. Returns: DataCatalog defined in `catalog.yml`. Raises: - KedroContextError: Incorrect ``AbstractDataCatalog`` registered for the project. + KedroContextError: Incorrect ``BaseDataCatalog`` registered for the project. """ return self._get_catalog() @@ -213,13 +213,13 @@ def _get_catalog( self, save_version: str | None = None, load_versions: dict[str, str] | None = None, - ) -> AbstractDataCatalog: - """A hook for changing the creation of a AbstractDataCatalog instance. + ) -> BaseDataCatalog: + """A hook for changing the creation of a BaseDataCatalog instance. Returns: DataCatalog defined in `catalog.yml`. Raises: - KedroContextError: Incorrect ``AbstractDataCatalog`` registered for the project. + KedroContextError: Incorrect ``BaseDataCatalog`` registered for the project. """ # '**/catalog*' reads modular pipeline configs diff --git a/kedro/framework/hooks/specs.py b/kedro/framework/hooks/specs.py index 0a7f0c9295..39fc0bcaf9 100644 --- a/kedro/framework/hooks/specs.py +++ b/kedro/framework/hooks/specs.py @@ -11,7 +11,7 @@ if TYPE_CHECKING: from kedro.framework.context import KedroContext - from kedro.io import AbstractDataCatalog + from kedro.io import BaseDataCatalog from kedro.pipeline import Pipeline from kedro.pipeline.node import Node @@ -22,7 +22,7 @@ class DataCatalogSpecs: @hook_spec def after_catalog_created( # noqa: PLR0913 self, - catalog: AbstractDataCatalog, + catalog: BaseDataCatalog, conf_catalog: dict[str, Any], conf_creds: dict[str, Any], feed_dict: dict[str, Any], @@ -53,7 +53,7 @@ class NodeSpecs: def before_node_run( self, node: Node, - catalog: AbstractDataCatalog, + catalog: BaseDataCatalog, inputs: dict[str, Any], is_async: bool, session_id: str, @@ -63,7 +63,7 @@ def before_node_run( Args: node: The ``Node`` to run. - catalog: A ``AbstractDataCatalog`` containing the node's inputs and outputs. + catalog: A ``BaseDataCatalog`` containing the node's inputs and outputs. inputs: The dictionary of inputs dataset. The keys are dataset names and the values are the actual loaded input data, not the dataset instance. @@ -81,7 +81,7 @@ def before_node_run( def after_node_run( # noqa: PLR0913 self, node: Node, - catalog: AbstractDataCatalog, + catalog: BaseDataCatalog, inputs: dict[str, Any], outputs: dict[str, Any], is_async: bool, @@ -93,7 +93,7 @@ def after_node_run( # noqa: PLR0913 Args: node: The ``Node`` that ran. - catalog: A ``AbstractDataCatalog`` containing the node's inputs and outputs. + catalog: A ``BaseDataCatalog`` containing the node's inputs and outputs. inputs: The dictionary of inputs dataset. The keys are dataset names and the values are the actual loaded input data, not the dataset instance. @@ -110,7 +110,7 @@ def on_node_error( # noqa: PLR0913 self, error: Exception, node: Node, - catalog: AbstractDataCatalog, + catalog: BaseDataCatalog, inputs: dict[str, Any], is_async: bool, session_id: str, @@ -122,7 +122,7 @@ def on_node_error( # noqa: PLR0913 Args: error: The uncaught exception thrown during the node run. node: The ``Node`` to run. - catalog: A ``AbstractDataCatalog`` containing the node's inputs and outputs. + catalog: A ``BaseDataCatalog`` containing the node's inputs and outputs. inputs: The dictionary of inputs dataset. The keys are dataset names and the values are the actual loaded input data, not the dataset instance. @@ -140,7 +140,7 @@ def before_pipeline_run( self, run_params: dict[str, Any], pipeline: Pipeline, - catalog: AbstractDataCatalog, + catalog: BaseDataCatalog, ) -> None: """Hook to be invoked before a pipeline runs. @@ -167,7 +167,7 @@ def before_pipeline_run( } pipeline: The ``Pipeline`` that will be run. - catalog: The ``AbstractDataCatalog`` to be used during the run. + catalog: The ``BaseDataCatalog`` to be used during the run. """ pass @@ -177,7 +177,7 @@ def after_pipeline_run( run_params: dict[str, Any], run_result: dict[str, Any], pipeline: Pipeline, - catalog: AbstractDataCatalog, + catalog: BaseDataCatalog, ) -> None: """Hook to be invoked after a pipeline runs. @@ -205,7 +205,7 @@ def after_pipeline_run( run_result: The output of ``Pipeline`` run. pipeline: The ``Pipeline`` that was run. - catalog: The ``AbstractDataCatalog`` used during the run. + catalog: The ``BaseDataCatalog`` used during the run. """ pass @@ -215,7 +215,7 @@ def on_pipeline_error( error: Exception, run_params: dict[str, Any], pipeline: Pipeline, - catalog: AbstractDataCatalog, + catalog: BaseDataCatalog, ) -> None: """Hook to be invoked if a pipeline run throws an uncaught Exception. The signature of this error hook should match the signature of ``before_pipeline_run`` @@ -245,7 +245,7 @@ def on_pipeline_error( } pipeline: The ``Pipeline`` that will was run. - catalog: The ``AbstractDataCatalog`` used during the run. + catalog: The ``BaseDataCatalog`` used during the run. """ pass diff --git a/kedro/io/__init__.py b/kedro/io/__init__.py index f6a1f5a165..1f45ec7bcd 100644 --- a/kedro/io/__init__.py +++ b/kedro/io/__init__.py @@ -15,14 +15,14 @@ Version, ) from .data_catalog import DataCatalog -from .data_catalog_redesign import AbstractDataCatalog, KedroDataCatalog +from .data_catalog_redesign import BaseDataCatalog, KedroDataCatalog from .lambda_dataset import LambdaDataset from .memory_dataset import MemoryDataset from .shared_memory_dataset import SharedMemoryDataset __all__ = [ "AbstractDataset", - "AbstractDataCatalog", + "BaseDataCatalog", "AbstractVersionedDataset", "CachedDataset", "DataCatalog", diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index a50c9cda5e..5ac0ec4869 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -318,4 +318,4 @@ def exists(self, name: str) -> bool: return dataset.exists() -AbstractDataCatalog = Union[DataCatalog, KedroDataCatalog] +BaseDataCatalog = Union[DataCatalog, KedroDataCatalog] diff --git a/kedro/runner/parallel_runner.py b/kedro/runner/parallel_runner.py index 3418dbcd11..586df02c4b 100644 --- a/kedro/runner/parallel_runner.py +++ b/kedro/runner/parallel_runner.py @@ -22,7 +22,7 @@ ) from kedro.framework.project import settings from kedro.io import ( - AbstractDataCatalog, + BaseDataCatalog, DatasetNotFoundError, MemoryDataset, SharedMemoryDataset, @@ -60,7 +60,7 @@ def _bootstrap_subprocess( def _run_node_synchronization( # noqa: PLR0913 node: Node, - catalog: AbstractDataCatalog, + catalog: BaseDataCatalog, is_async: bool = False, session_id: str | None = None, package_name: str | None = None, @@ -73,7 +73,7 @@ def _run_node_synchronization( # noqa: PLR0913 Args: node: The ``Node`` to run. - catalog: A ``AbstractDataCatalog`` containing the node's inputs and outputs. + catalog: A ``BaseDataCatalog`` containing the node's inputs and outputs. is_async: If True, the node inputs and outputs are loaded and saved asynchronously with threads. Defaults to False. session_id: The session id of the pipeline run. @@ -118,7 +118,7 @@ def __init__( cannot be larger than 61 and will be set to min(61, max_workers). is_async: If True, the node inputs and outputs are loaded and saved asynchronously with threads. Defaults to False. - extra_dataset_patterns: Extra dataset factory patterns to be added to the AbstractDataCatalog + extra_dataset_patterns: Extra dataset factory patterns to be added to the BaseDataCatalog during the run. This is used to set the default datasets to SharedMemoryDataset for `ParallelRunner`. @@ -168,9 +168,7 @@ def _validate_nodes(cls, nodes: Iterable[Node]) -> None: ) @classmethod - def _validate_catalog( - cls, catalog: AbstractDataCatalog, pipeline: Pipeline - ) -> None: + def _validate_catalog(cls, catalog: BaseDataCatalog, pipeline: Pipeline) -> None: """Ensure that all data sets are serialisable and that we do not have any non proxied memory data sets being used as outputs as their content will not be synchronized across threads. @@ -216,7 +214,7 @@ def _validate_catalog( ) def _set_manager_datasets( - self, catalog: AbstractDataCatalog, pipeline: Pipeline + self, catalog: BaseDataCatalog, pipeline: Pipeline ) -> None: for dataset in pipeline.datasets(): try: @@ -244,7 +242,7 @@ def _get_required_workers_count(self, pipeline: Pipeline) -> int: def _run( self, pipeline: Pipeline, - catalog: AbstractDataCatalog, + catalog: BaseDataCatalog, hook_manager: PluginManager, session_id: str | None = None, ) -> None: @@ -252,7 +250,7 @@ def _run( Args: pipeline: The ``Pipeline`` to run. - catalog: The ``AbstractDataCatalog`` from which to fetch data. + catalog: The ``BaseDataCatalog`` from which to fetch data. hook_manager: The ``PluginManager`` to activate hooks. session_id: The id of the session. diff --git a/kedro/runner/runner.py b/kedro/runner/runner.py index 1763fd5036..2ace73a10c 100644 --- a/kedro/runner/runner.py +++ b/kedro/runner/runner.py @@ -21,7 +21,7 @@ from more_itertools import interleave from kedro.framework.hooks.manager import _NullPluginManager -from kedro.io import AbstractDataCatalog, MemoryDataset +from kedro.io import BaseDataCatalog, MemoryDataset from kedro.pipeline import Pipeline if TYPE_CHECKING: @@ -45,7 +45,7 @@ def __init__( Args: is_async: If True, the node inputs and outputs are loaded and saved asynchronously with threads. Defaults to False. - extra_dataset_patterns: Extra dataset factory patterns to be added to the AbstractDataCatalog + extra_dataset_patterns: Extra dataset factory patterns to be added to the BaseDataCatalog during the run. This is used to set the default datasets on the Runner instances. """ @@ -59,7 +59,7 @@ def _logger(self) -> logging.Logger: def run( self, pipeline: Pipeline, - catalog: AbstractDataCatalog, + catalog: BaseDataCatalog, hook_manager: PluginManager | None = None, session_id: str | None = None, ) -> dict[str, Any]: @@ -68,7 +68,7 @@ def run( Args: pipeline: The ``Pipeline`` to run. - catalog: The ``AbstractDataCatalog`` from which to fetch data. + catalog: The ``BaseDataCatalog`` from which to fetch data. hook_manager: The ``PluginManager`` to activate hooks. session_id: The id of the session. @@ -76,7 +76,7 @@ def run( ValueError: Raised when ``Pipeline`` inputs cannot be satisfied. Returns: - Any node outputs that cannot be processed by the ``AbstractDataCatalog``. + Any node outputs that cannot be processed by the ``BaseDataCatalog``. These are returned in a dictionary, where the keys are defined by the node outputs. @@ -94,7 +94,7 @@ def run( if unsatisfied: raise ValueError( - f"Pipeline input(s) {unsatisfied} not found in the AbstractDataCatalog" + f"Pipeline input(s) {unsatisfied} not found in the BaseDataCatalog" ) # Identify MemoryDataset in the catalog @@ -126,7 +126,7 @@ def run( def run_only_missing( self, pipeline: Pipeline, - catalog: AbstractDataCatalog, + catalog: BaseDataCatalog, hook_manager: PluginManager, ) -> dict[str, Any]: """Run only the missing outputs from the ``Pipeline`` using the @@ -135,7 +135,7 @@ def run_only_missing( Args: pipeline: The ``Pipeline`` to run. - catalog: The ``AbstractDataCatalog`` from which to fetch data. + catalog: The ``BaseDataCatalog`` from which to fetch data. hook_manager: The ``PluginManager`` to activate hooks. Raises: ValueError: Raised when ``Pipeline`` inputs cannot be @@ -143,7 +143,7 @@ def run_only_missing( Returns: Any node outputs that cannot be processed by the - ``AbstractDataCatalog``. These are returned in a dictionary, where + ``BaseDataCatalog``. These are returned in a dictionary, where the keys are defined by the node outputs. """ @@ -167,7 +167,7 @@ def run_only_missing( def _run( self, pipeline: Pipeline, - catalog: AbstractDataCatalog, + catalog: BaseDataCatalog, hook_manager: PluginManager, session_id: str | None = None, ) -> None: @@ -176,7 +176,7 @@ def _run( Args: pipeline: The ``Pipeline`` to run. - catalog: The ``AbstractDataCatalog`` from which to fetch data. + catalog: The ``BaseDataCatalog`` from which to fetch data. hook_manager: The ``PluginManager`` to activate hooks. session_id: The id of the session. @@ -187,7 +187,7 @@ def _suggest_resume_scenario( self, pipeline: Pipeline, done_nodes: Iterable[Node], - catalog: AbstractDataCatalog, + catalog: BaseDataCatalog, ) -> None: """ Suggest a command to the user to resume a run after it fails. @@ -197,7 +197,7 @@ def _suggest_resume_scenario( Args: pipeline: the ``Pipeline`` of the run. done_nodes: the ``Node``s that executed successfully. - catalog: the ``AbstractDataCatalog`` of the run. + catalog: the ``BaseDataCatalog`` of the run. """ remaining_nodes = set(pipeline.nodes) - set(done_nodes) @@ -226,7 +226,7 @@ def _suggest_resume_scenario( def _find_nodes_to_resume_from( - pipeline: Pipeline, unfinished_nodes: Collection[Node], catalog: AbstractDataCatalog + pipeline: Pipeline, unfinished_nodes: Collection[Node], catalog: BaseDataCatalog ) -> set[str]: """Given a collection of unfinished nodes in a pipeline using a certain catalog, find the node names to pass to pipeline.from_nodes() @@ -236,7 +236,7 @@ def _find_nodes_to_resume_from( Args: pipeline: the ``Pipeline`` to find starting nodes for. unfinished_nodes: collection of ``Node``s that have not finished yet - catalog: the ``AbstractDataCatalog`` of the run. + catalog: the ``BaseDataCatalog`` of the run. Returns: Set of node names to pass to pipeline.from_nodes() to continue @@ -254,7 +254,7 @@ def _find_nodes_to_resume_from( def _find_all_nodes_for_resumed_pipeline( - pipeline: Pipeline, unfinished_nodes: Iterable[Node], catalog: AbstractDataCatalog + pipeline: Pipeline, unfinished_nodes: Iterable[Node], catalog: BaseDataCatalog ) -> set[Node]: """Breadth-first search approach to finding the complete set of ``Node``s which need to run to cover all unfinished nodes, @@ -264,7 +264,7 @@ def _find_all_nodes_for_resumed_pipeline( Args: pipeline: the ``Pipeline`` to analyze. unfinished_nodes: the iterable of ``Node``s which have not finished yet. - catalog: the ``AbstractDataCatalog`` of the run. + catalog: the ``BaseDataCatalog`` of the run. Returns: A set containing all input unfinished ``Node``s and all remaining @@ -312,14 +312,12 @@ def _nodes_with_external_inputs(nodes_of_interest: Iterable[Node]) -> set[Node]: return set(p_nodes_with_external_inputs.nodes) -def _enumerate_non_persistent_inputs( - node: Node, catalog: AbstractDataCatalog -) -> set[str]: +def _enumerate_non_persistent_inputs(node: Node, catalog: BaseDataCatalog) -> set[str]: """Enumerate non-persistent input datasets of a ``Node``. Args: node: the ``Node`` to check the inputs of. - catalog: the ``AbstractDataCatalog`` of the run. + catalog: the ``BaseDataCatalog`` of the run. Returns: Set of names of non-persistent inputs of given ``Node``. @@ -384,7 +382,7 @@ def _find_initial_node_group(pipeline: Pipeline, nodes: Iterable[Node]) -> list[ def run_node( node: Node, - catalog: AbstractDataCatalog, + catalog: BaseDataCatalog, hook_manager: PluginManager, is_async: bool = False, session_id: str | None = None, @@ -393,7 +391,7 @@ def run_node( Args: node: The ``Node`` to run. - catalog: A ``AbstractDataCatalog`` containing the node's inputs and outputs. + catalog: A ``BaseDataCatalog`` containing the node's inputs and outputs. hook_manager: The ``PluginManager`` to activate hooks. is_async: If True, the node inputs and outputs are loaded and saved asynchronously with threads. Defaults to False. @@ -427,7 +425,7 @@ def run_node( def _collect_inputs_from_hook( # noqa: PLR0913 node: Node, - catalog: AbstractDataCatalog, + catalog: BaseDataCatalog, inputs: dict[str, Any], is_async: bool, hook_manager: PluginManager, @@ -460,7 +458,7 @@ def _collect_inputs_from_hook( # noqa: PLR0913 def _call_node_run( # noqa: PLR0913 node: Node, - catalog: AbstractDataCatalog, + catalog: BaseDataCatalog, inputs: dict[str, Any], is_async: bool, hook_manager: PluginManager, @@ -491,7 +489,7 @@ def _call_node_run( # noqa: PLR0913 def _run_node_sequential( node: Node, - catalog: AbstractDataCatalog, + catalog: BaseDataCatalog, hook_manager: PluginManager, session_id: str | None = None, ) -> Node: @@ -538,7 +536,7 @@ def _run_node_sequential( def _run_node_async( node: Node, - catalog: AbstractDataCatalog, + catalog: BaseDataCatalog, hook_manager: PluginManager, session_id: str | None = None, ) -> Node: diff --git a/kedro/runner/sequential_runner.py b/kedro/runner/sequential_runner.py index 6412e84b5c..165b3a06ba 100644 --- a/kedro/runner/sequential_runner.py +++ b/kedro/runner/sequential_runner.py @@ -14,7 +14,7 @@ if TYPE_CHECKING: from pluggy import PluginManager - from kedro.io import AbstractDataCatalog + from kedro.io import BaseDataCatalog from kedro.pipeline import Pipeline @@ -34,7 +34,7 @@ def __init__( Args: is_async: If True, the node inputs and outputs are loaded and saved asynchronously with threads. Defaults to False. - extra_dataset_patterns: Extra dataset factory patterns to be added to the AbstractDataCatalog + extra_dataset_patterns: Extra dataset factory patterns to be added to the BaseDataCatalog during the run. This is used to set the default datasets to MemoryDataset for `SequentialRunner`. @@ -48,7 +48,7 @@ def __init__( def _run( self, pipeline: Pipeline, - catalog: AbstractDataCatalog, + catalog: BaseDataCatalog, hook_manager: PluginManager, session_id: str | None = None, ) -> None: @@ -56,7 +56,7 @@ def _run( Args: pipeline: The ``Pipeline`` to run. - catalog: The ``AbstractDataCatalog`` from which to fetch data. + catalog: The ``BaseDataCatalog`` from which to fetch data. hook_manager: The ``PluginManager`` to activate hooks. session_id: The id of the session. diff --git a/kedro/runner/thread_runner.py b/kedro/runner/thread_runner.py index 26b08ef2b4..ed7746c7a6 100644 --- a/kedro/runner/thread_runner.py +++ b/kedro/runner/thread_runner.py @@ -16,7 +16,7 @@ if TYPE_CHECKING: from pluggy import PluginManager - from kedro.io import AbstractDataCatalog + from kedro.io import BaseDataCatalog from kedro.pipeline import Pipeline from kedro.pipeline.node import Node @@ -43,7 +43,7 @@ def __init__( is_async: If True, set to False, because `ThreadRunner` doesn't support loading and saving the node inputs and outputs asynchronously with threads. Defaults to False. - extra_dataset_patterns: Extra dataset factory patterns to be added to the AbstractDataCatalog + extra_dataset_patterns: Extra dataset factory patterns to be added to the BaseDataCatalog during the run. This is used to set the default datasets to MemoryDataset for `ThreadRunner`. @@ -87,7 +87,7 @@ def _get_required_workers_count(self, pipeline: Pipeline) -> int: def _run( self, pipeline: Pipeline, - catalog: AbstractDataCatalog, + catalog: BaseDataCatalog, hook_manager: PluginManager, session_id: str | None = None, ) -> None: @@ -95,7 +95,7 @@ def _run( Args: pipeline: The ``Pipeline`` to run. - catalog: The ``AbstractDataCatalog`` from which to fetch data. + catalog: The ``BaseDataCatalog`` from which to fetch data. hook_manager: The ``PluginManager`` to activate hooks. session_id: The id of the session. diff --git a/tests/runner/test_sequential_runner.py b/tests/runner/test_sequential_runner.py index e74bf6263d..c8f54f7337 100644 --- a/tests/runner/test_sequential_runner.py +++ b/tests/runner/test_sequential_runner.py @@ -130,7 +130,7 @@ def test_conflict_feed_catalog( def test_unsatisfied_inputs(self, is_async, unfinished_outputs_pipeline, catalog): """ds1, ds2 and ds3 were not specified.""" - with pytest.raises(ValueError, match=r"not found in the AbstractDataCatalog"): + with pytest.raises(ValueError, match=r"not found in the BaseDataCatalog"): SequentialRunner(is_async=is_async).run( unfinished_outputs_pipeline, catalog ) From 4ecb8263c85fab6b33430a5b810f1469b2931cae Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 10 Sep 2024 18:57:28 +0100 Subject: [PATCH 089/173] Moved validate_dataset_config inside catalog Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index 5ac0ec4869..855edc0853 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -28,15 +28,6 @@ from kedro.utils import _format_rich, _has_rich_handler -def validate_dataset_config(ds_name: str, ds_config: Any) -> None: - if not isinstance(ds_config, dict): - raise DatasetError( - f"Catalog entry '{ds_name}' is not a valid dataset configuration. " - "\nHint: If this catalog entry is intended for variable interpolation, " - "make sure that the key is preceded by an underscore." - ) - - class KedroDataCatalog: def __init__( self, @@ -136,10 +127,19 @@ def from_config( config_resolver=config_resolver, ) + @staticmethod + def _validate_dataset_config(ds_name: str, ds_config: Any) -> None: + if not isinstance(ds_config, dict): + raise DatasetError( + f"Catalog entry '{ds_name}' is not a valid dataset configuration. " + "\nHint: If this catalog entry is intended for variable interpolation, " + "make sure that the key is preceded by an underscore." + ) + def _init_dataset(self, ds_name: str, ds_config: dict[str, Any]) -> None: # Add lazy loading feature to store the configuration but not to init actual dataset # Initialise actual dataset when load or save - validate_dataset_config(ds_name, ds_config) + self._validate_dataset_config(ds_name, ds_config) ds = AbstractDataset.from_config( ds_name, ds_config, From 2b9be66144b0729928cb192cff6823916fc70664 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 10 Sep 2024 19:00:11 +0100 Subject: [PATCH 090/173] Renamed _init_dataset to _add_from_config Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index 855edc0853..ab7f5c8625 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -45,7 +45,7 @@ def __init__( self._use_rich_markup = _has_rich_handler() for ds_name, ds_config in self._config_resolver.config.items(): - self._init_dataset(ds_name, ds_config) + self._add_from_config(ds_name, ds_config) if feed_dict: self.add_from_dict(feed_dict) @@ -136,7 +136,7 @@ def _validate_dataset_config(ds_name: str, ds_config: Any) -> None: "make sure that the key is preceded by an underscore." ) - def _init_dataset(self, ds_name: str, ds_config: dict[str, Any]) -> None: + def _add_from_config(self, ds_name: str, ds_config: dict[str, Any]) -> None: # Add lazy loading feature to store the configuration but not to init actual dataset # Initialise actual dataset when load or save self._validate_dataset_config(ds_name, ds_config) From fb3831be1930813abf8433a5ace9bf70e5e1738c Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 10 Sep 2024 19:11:37 +0100 Subject: [PATCH 091/173] Fix lint Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index ab7f5c8625..375ba6bcaf 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -155,7 +155,7 @@ def get_dataset( ds_config = self._config_resolver.resolve_dataset_pattern(ds_name) if ds_name not in self._datasets and ds_config is not None: - self._init_dataset(ds_name, ds_config) # type: ignore[arg-type] + self._add_from_config(ds_name, ds_config) # type: ignore[arg-type] dataset = self._datasets.get(ds_name, None) From 8f604d1da1428210b1792585bb1ab57a5de9b5b6 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 11 Sep 2024 10:47:44 +0100 Subject: [PATCH 092/173] Updated release notes Signed-off-by: Elena Khaustova --- RELEASE.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/RELEASE.md b/RELEASE.md index 34e75ffb74..548b49a109 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,6 +1,8 @@ # Upcoming Release ## Major features and improvements +* Refactored `kedro run` and `kedro catalog` commands. +* Moved pattern resolution logic from `DataCatalog` to a separate component - `CatalogConfigResolver`. Updated `DataCatalog` to use `CatalogConfigResolver` internally. * Made packaged Kedro projects return `session.run()` output to be used when running it in the interactive environment. * Enhanced `OmegaConfigLoader` configuration validation to detect duplicate keys at all parameter levels, ensuring comprehensive nested key checking. ## Bug fixes and other changes From 9a4db1858473294c6439687359e5368d06a1a2e1 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 11 Sep 2024 13:53:06 +0100 Subject: [PATCH 093/173] Returned DatasetError Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 4 ++-- tests/io/test_data_catalog.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index f3548e4dd5..ab679f1e4b 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -73,7 +73,7 @@ def _sort_patterns(cls, dataset_patterns: Patterns) -> Patterns: pattern for pattern in sorted_keys if cls._pattern_specificity(pattern) == 0 ] if len(catch_all) > 1: - raise ValueError( + raise DatasetError( f"Multiple catch-all patterns found in the catalog: {', '.join(catch_all)}. Only one catch-all pattern is allowed, remove the extras." ) return {key: dataset_patterns[key] for key in sorted_keys} @@ -150,7 +150,7 @@ def _resolve_dataset_config( try: config = config.format_map(resolved_vars.named) except KeyError as exc: - raise KeyError( + raise DatasetError( f"Unable to resolve '{config}' from the pattern '{pattern}'. Keys used in the configuration " f"should be present in the dataset factory pattern." ) from exc diff --git a/tests/io/test_data_catalog.py b/tests/io/test_data_catalog.py index be8ed0831e..db777cc634 100644 --- a/tests/io/test_data_catalog.py +++ b/tests/io/test_data_catalog.py @@ -925,7 +925,7 @@ def test_multiple_catch_all_patterns_not_allowed( } with pytest.raises( - ValueError, match="Multiple catch-all patterns found in the catalog" + DatasetError, match="Multiple catch-all patterns found in the catalog" ): DataCatalog.from_config(**config_with_dataset_factories) @@ -1019,7 +1019,7 @@ def test_unmatched_key_error_when_parsing_config( "Unable to resolve 'data/01_raw/{brand}_plane.pq' from the pattern '{type}@planes'. " "Keys used in the configuration should be present in the dataset factory pattern." ) - with pytest.raises(KeyError, match=re.escape(pattern)): + with pytest.raises(DatasetError, match=re.escape(pattern)): catalog._get_dataset("jet@planes") def test_factory_config_versioned( From 0a6946ab4032ffb51c74c93e1bd35784cecddbb9 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 11 Sep 2024 14:13:05 +0100 Subject: [PATCH 094/173] Added _dataset_patterns and _default_pattern to _config_resolver to avoid breaking change Signed-off-by: Elena Khaustova --- kedro/io/data_catalog.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index 2b09c35e80..475c18a148 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -160,6 +160,12 @@ def __init__( # noqa: PLR0913 >>> catalog = DataCatalog(datasets={'cars': cars}) """ self._config_resolver = config_resolver or CatalogConfigResolver() + + # Kept to avoid breaking changes + if not config_resolver: + self._config_resolver._dataset_patterns = dataset_patterns or {} + self._config_resolver._default_pattern = default_pattern or {} + self._datasets: dict[str, AbstractDataset] = {} self.datasets: _FrozenDatasets | None = None From fee7bd6d662045572d09a98a3038d3e39671ec6d Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 11 Sep 2024 15:26:12 +0100 Subject: [PATCH 095/173] Made resolve_dataset_pattern return just dict Signed-off-by: Elena Khaustova --- kedro/framework/cli/catalog.py | 12 ++++---- kedro/io/catalog_config_resolver.py | 47 ++++++++++++----------------- kedro/io/data_catalog.py | 4 +-- 3 files changed, 28 insertions(+), 35 deletions(-) diff --git a/kedro/framework/cli/catalog.py b/kedro/framework/cli/catalog.py index 4001b696f3..7bd0197e5b 100644 --- a/kedro/framework/cli/catalog.py +++ b/kedro/framework/cli/catalog.py @@ -93,12 +93,12 @@ def list_datasets(metadata: ProjectMetadata, pipeline: str, env: str) -> None: # resolve any factory datasets in the pipeline factory_ds_by_type = defaultdict(list) - resolved_configs = data_catalog.config_resolver.resolve_dataset_pattern( - default_ds - ) - for ds_name, ds_config in zip(default_ds, resolved_configs): + for ds_name in default_ds: if data_catalog.config_resolver.match_pattern(ds_name): - factory_ds_by_type[ds_config.get("type", "DefaultDataset")].append( # type: ignore[attr-defined] + ds_config = data_catalog.config_resolver.resolve_dataset_pattern( + ds_name + ) + factory_ds_by_type[ds_config.get("type", "DefaultDataset")].append( ds_name ) @@ -253,7 +253,7 @@ def resolve_patterns(metadata: ProjectMetadata, env: str) -> None: ds_config = data_catalog.config_resolver.resolve_dataset_pattern(ds_name) # Exclude MemoryDatasets not set in the catalog explicitly - if ds_config is not None: + if ds_config: explicit_datasets[ds_name] = ds_config secho(yaml.dump(explicit_datasets)) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index ab679f1e4b..91218d030c 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -229,36 +229,29 @@ def _resolve_config_credentials( return resolved_configs - def resolve_dataset_pattern( - self, datasets: str | list[str] - ) -> dict[str, Any] | list[dict[str, Any]]: + def resolve_dataset_pattern(self, ds_name: str) -> dict[str, Any]: """Resolve dataset patterns and return resolved configurations based on the existing patterns.""" - datasets_lst = [datasets] if isinstance(datasets, str) else datasets - resolved_configs = [] - - for ds_name in datasets_lst: - matched_pattern = self.match_pattern(ds_name) - if matched_pattern and ds_name not in self._resolved_configs: - pattern_config = self._get_pattern_config(matched_pattern) - ds_config = self._resolve_dataset_config( - ds_name, matched_pattern, copy.deepcopy(pattern_config) + matched_pattern = self.match_pattern(ds_name) + + if matched_pattern and ds_name not in self._resolved_configs: + pattern_config = self._get_pattern_config(matched_pattern) + ds_config = self._resolve_dataset_config( + ds_name, matched_pattern, copy.deepcopy(pattern_config) + ) + + if ( + self._pattern_specificity(matched_pattern) == 0 + and matched_pattern in self._default_pattern + ): + self._logger.warning( + "Config from the dataset factory pattern '%s' in the catalog will be used to " + "override the default dataset creation for '%s'", + matched_pattern, + ds_name, ) + return ds_config - if ( - self._pattern_specificity(matched_pattern) == 0 - and matched_pattern in self._default_pattern - ): - self._logger.warning( - "Config from the dataset factory pattern '%s' in the catalog will be used to " - "override the default dataset creation for '%s'", - matched_pattern, - ds_name, - ) - resolved_configs.append(ds_config) - else: - resolved_configs.append(self._resolved_configs.get(ds_name, None)) - - return resolved_configs[0] if isinstance(datasets, str) else resolved_configs + return self._resolved_configs.get(ds_name, {}) def add_runtime_patterns(self, dataset_patterns: Patterns) -> None: """Add new runtime patterns and re-sort them.""" diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index 475c18a148..420f8857c8 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -326,10 +326,10 @@ def _get_dataset( ) -> AbstractDataset: ds_config = self._config_resolver.resolve_dataset_pattern(dataset_name) - if dataset_name not in self._datasets and ds_config is not None: + if dataset_name not in self._datasets and ds_config: ds = AbstractDataset.from_config( dataset_name, - ds_config, # type: ignore[arg-type] + ds_config, self._load_versions.get(dataset_name), self._save_version, ) From f5a7992a57b5407da8504be84eaf2a6888bce84f Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 11 Sep 2024 15:34:40 +0100 Subject: [PATCH 096/173] Fixed linter Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index 91218d030c..97ffbadd5f 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -249,7 +249,7 @@ def resolve_dataset_pattern(self, ds_name: str) -> dict[str, Any]: matched_pattern, ds_name, ) - return ds_config + return ds_config # type: ignore[no-any-return] return self._resolved_configs.get(ds_name, {}) From 1c981f33033243d0205c58df1ea3d807337bc0aa Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 11 Sep 2024 15:55:24 +0100 Subject: [PATCH 097/173] Added Catalogprotocol draft Signed-off-by: Elena Khaustova --- kedro/io/core.py | 68 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 67 insertions(+), 1 deletion(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index f3975c9c3c..5a62ead381 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -17,7 +17,7 @@ from glob import iglob from operator import attrgetter from pathlib import Path, PurePath, PurePosixPath -from typing import TYPE_CHECKING, Any, Callable, Generic, TypeVar +from typing import TYPE_CHECKING, Any, Callable, Generic, Protocol, TypeVar from urllib.parse import urlsplit from cachetools import Cache, cachedmethod @@ -29,6 +29,8 @@ if TYPE_CHECKING: import os + from kedro.io.catalog_config_resolver import CatalogConfigResolver, Patterns + VERSION_FORMAT = "%Y-%m-%dT%H.%M.%S.%fZ" VERSIONED_FLAG_KEY = "versioned" VERSION_KEY = "version" @@ -871,3 +873,67 @@ def validate_on_forbidden_chars(**kwargs: Any) -> None: raise DatasetError( f"Neither white-space nor semicolon are allowed in '{key}'." ) + + +class CatalogProtocol(Protocol): + def __contains__(self, ds_name: str) -> bool: + """Check if a dataset is in the catalog.""" + ... + + @property + def config_resolver(self) -> CatalogConfigResolver: + """Return a copy of the datasets dictionary.""" + ... + + @classmethod + def from_config(cls, catalog: dict[str, dict[str, Any]] | None) -> Any: + """Create a ``KedroDataCatalog`` instance from configuration.""" + ... + + def _get_dataset( + self, ds_name: str, suggest: bool = True, version: Any = None | None + ) -> Any: + """Retrieve a dataset by its name.""" + ... + + def list(self, regex_search: str = None | None) -> list[str]: + """List all dataset names registered in the catalog.""" + ... + + def save(self, name: str, data: Any) -> None: + """Save data to a registered dataset.""" + ... + + def load(self, name: str, version: str = None | None) -> Any: + """Load data from a registered dataset.""" + ... + + def add(self, ds_name: str, dataset: Any, replace: bool = False) -> None: + """Add a new dataset to the catalog.""" + ... + + def add_all(self, datasets: dict[str, Any], replace: bool = False) -> None: + """Add a new dataset to the catalog.""" + ... + + def add_feed_dict(self, datasets: dict[str, Any], replace: bool = False) -> None: + """Add datasets to the catalog using the data provided through the `feed_dict`.""" + ... + + def exists(self, name: str) -> bool: + """Checks whether registered data set exists by calling its `exists()` method.""" + pass + + def release(self, name: str) -> None: + """Release any cached data associated with a dataset.""" + ... + + def confirm(self, name: str) -> None: + """Confirm a dataset by its name.""" + ... + + def shallow_copy( + self, extra_dataset_patterns: Patterns | None = None + ) -> CatalogProtocol: + """Returns a shallow copy of the current object.""" + ... From 6128be72ce887932c2cc7728f69c51057bd6dfb3 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 12 Sep 2024 11:41:04 +0100 Subject: [PATCH 098/173] Implemented CatalogProtocol Signed-off-by: Elena Khaustova --- kedro/io/core.py | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index 5a62ead381..ecdb85d505 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -17,7 +17,15 @@ from glob import iglob from operator import attrgetter from pathlib import Path, PurePath, PurePosixPath -from typing import TYPE_CHECKING, Any, Callable, Generic, Protocol, TypeVar +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Generic, + Protocol, + TypeVar, + runtime_checkable, +) from urllib.parse import urlsplit from cachetools import Cache, cachedmethod @@ -875,7 +883,13 @@ def validate_on_forbidden_chars(**kwargs: Any) -> None: ) -class CatalogProtocol(Protocol): +_C = TypeVar("_C") + + +@runtime_checkable +class CatalogProtocol(Protocol["_C"]): + _datasets: dict[str, AbstractDataset] + def __contains__(self, ds_name: str) -> bool: """Check if a dataset is in the catalog.""" ... @@ -886,17 +900,17 @@ def config_resolver(self) -> CatalogConfigResolver: ... @classmethod - def from_config(cls, catalog: dict[str, dict[str, Any]] | None) -> Any: + def from_config(cls, catalog: dict[str, dict[str, Any]] | None) -> _C: """Create a ``KedroDataCatalog`` instance from configuration.""" ... def _get_dataset( - self, ds_name: str, suggest: bool = True, version: Any = None | None - ) -> Any: + self, ds_name: str, suggest: bool = True, version: Any = None + ) -> AbstractDataset: """Retrieve a dataset by its name.""" ... - def list(self, regex_search: str = None | None) -> list[str]: + def list(self, regex_search: str | None = None) -> list[str]: """List all dataset names registered in the catalog.""" ... @@ -904,7 +918,7 @@ def save(self, name: str, data: Any) -> None: """Save data to a registered dataset.""" ... - def load(self, name: str, version: str = None | None) -> Any: + def load(self, name: str, version: str | None = None) -> _DO: """Load data from a registered dataset.""" ... @@ -932,8 +946,6 @@ def confirm(self, name: str) -> None: """Confirm a dataset by its name.""" ... - def shallow_copy( - self, extra_dataset_patterns: Patterns | None = None - ) -> CatalogProtocol: + def shallow_copy(self, extra_dataset_patterns: Patterns | None = None) -> _C: """Returns a shallow copy of the current object.""" ... From 8c91d0e828ee6e49e909fa2f6bc5f1ec9f59d605 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 12 Sep 2024 11:42:48 +0100 Subject: [PATCH 099/173] Updated types Signed-off-by: Elena Khaustova --- kedro/framework/context/context.py | 8 ++--- kedro/framework/hooks/specs.py | 16 +++++----- kedro/io/__init__.py | 2 ++ kedro/runner/parallel_runner.py | 18 ++++++----- kedro/runner/runner.py | 50 +++++++++++++++--------------- kedro/runner/sequential_runner.py | 8 ++--- kedro/runner/thread_runner.py | 8 ++--- 7 files changed, 57 insertions(+), 53 deletions(-) diff --git a/kedro/framework/context/context.py b/kedro/framework/context/context.py index 3b61b747f6..25995eb63f 100644 --- a/kedro/framework/context/context.py +++ b/kedro/framework/context/context.py @@ -14,7 +14,7 @@ from kedro.config import AbstractConfigLoader, MissingConfigException from kedro.framework.project import settings -from kedro.io import DataCatalog # noqa: TCH001 +from kedro.io import CatalogProtocol, DataCatalog # noqa: TCH001 from kedro.pipeline.transcoding import _transcode_split if TYPE_CHECKING: @@ -123,7 +123,7 @@ def _convert_paths_to_absolute_posix( return conf_dictionary -def _validate_transcoded_datasets(catalog: DataCatalog) -> None: +def _validate_transcoded_datasets(catalog: CatalogProtocol) -> None: """Validates transcoded datasets are correctly named Args: @@ -178,7 +178,7 @@ class KedroContext: ) @property - def catalog(self) -> DataCatalog: + def catalog(self) -> CatalogProtocol: """Read-only property referring to Kedro's ``DataCatalog`` for this context. Returns: @@ -213,7 +213,7 @@ def _get_catalog( self, save_version: str | None = None, load_versions: dict[str, str] | None = None, - ) -> DataCatalog: + ) -> CatalogProtocol: """A hook for changing the creation of a DataCatalog instance. Returns: diff --git a/kedro/framework/hooks/specs.py b/kedro/framework/hooks/specs.py index b0037a0878..3fd4871aee 100644 --- a/kedro/framework/hooks/specs.py +++ b/kedro/framework/hooks/specs.py @@ -11,7 +11,7 @@ if TYPE_CHECKING: from kedro.framework.context import KedroContext - from kedro.io import DataCatalog + from kedro.io import CatalogProtocol from kedro.pipeline import Pipeline from kedro.pipeline.node import Node @@ -22,7 +22,7 @@ class DataCatalogSpecs: @hook_spec def after_catalog_created( # noqa: PLR0913 self, - catalog: DataCatalog, + catalog: CatalogProtocol, conf_catalog: dict[str, Any], conf_creds: dict[str, Any], feed_dict: dict[str, Any], @@ -53,7 +53,7 @@ class NodeSpecs: def before_node_run( self, node: Node, - catalog: DataCatalog, + catalog: CatalogProtocol, inputs: dict[str, Any], is_async: bool, session_id: str, @@ -81,7 +81,7 @@ def before_node_run( def after_node_run( # noqa: PLR0913 self, node: Node, - catalog: DataCatalog, + catalog: CatalogProtocol, inputs: dict[str, Any], outputs: dict[str, Any], is_async: bool, @@ -110,7 +110,7 @@ def on_node_error( # noqa: PLR0913 self, error: Exception, node: Node, - catalog: DataCatalog, + catalog: CatalogProtocol, inputs: dict[str, Any], is_async: bool, session_id: str, @@ -137,7 +137,7 @@ class PipelineSpecs: @hook_spec def before_pipeline_run( - self, run_params: dict[str, Any], pipeline: Pipeline, catalog: DataCatalog + self, run_params: dict[str, Any], pipeline: Pipeline, catalog: CatalogProtocol ) -> None: """Hook to be invoked before a pipeline runs. @@ -174,7 +174,7 @@ def after_pipeline_run( run_params: dict[str, Any], run_result: dict[str, Any], pipeline: Pipeline, - catalog: DataCatalog, + catalog: CatalogProtocol, ) -> None: """Hook to be invoked after a pipeline runs. @@ -212,7 +212,7 @@ def on_pipeline_error( error: Exception, run_params: dict[str, Any], pipeline: Pipeline, - catalog: DataCatalog, + catalog: CatalogProtocol, ) -> None: """Hook to be invoked if a pipeline run throws an uncaught Exception. The signature of this error hook should match the signature of ``before_pipeline_run`` diff --git a/kedro/io/__init__.py b/kedro/io/__init__.py index 4b4a2e1b52..c4d968c2ba 100644 --- a/kedro/io/__init__.py +++ b/kedro/io/__init__.py @@ -9,6 +9,7 @@ from .core import ( AbstractDataset, AbstractVersionedDataset, + CatalogProtocol, DatasetAlreadyExistsError, DatasetError, DatasetNotFoundError, @@ -23,6 +24,7 @@ "AbstractDataset", "AbstractVersionedDataset", "CachedDataset", + "CatalogProtocol", "DataCatalog", "CatalogConfigResolver", "DatasetAlreadyExistsError", diff --git a/kedro/runner/parallel_runner.py b/kedro/runner/parallel_runner.py index 62d7e1216b..903c9ece99 100644 --- a/kedro/runner/parallel_runner.py +++ b/kedro/runner/parallel_runner.py @@ -22,7 +22,7 @@ ) from kedro.framework.project import settings from kedro.io import ( - DataCatalog, + CatalogProtocol, DatasetNotFoundError, MemoryDataset, SharedMemoryDataset, @@ -60,7 +60,7 @@ def _bootstrap_subprocess( def _run_node_synchronization( # noqa: PLR0913 node: Node, - catalog: DataCatalog, + catalog: CatalogProtocol, is_async: bool = False, session_id: str | None = None, package_name: str | None = None, @@ -73,7 +73,7 @@ def _run_node_synchronization( # noqa: PLR0913 Args: node: The ``Node`` to run. - catalog: A ``DataCatalog`` containing the node's inputs and outputs. + catalog: A ``CatalogProtocol`` containing the node's inputs and outputs. is_async: If True, the node inputs and outputs are loaded and saved asynchronously with threads. Defaults to False. session_id: The session id of the pipeline run. @@ -118,7 +118,7 @@ def __init__( cannot be larger than 61 and will be set to min(61, max_workers). is_async: If True, the node inputs and outputs are loaded and saved asynchronously with threads. Defaults to False. - extra_dataset_patterns: Extra dataset factory patterns to be added to the DataCatalog + extra_dataset_patterns: Extra dataset factory patterns to be added to the CatalogProtocol during the run. This is used to set the default datasets to SharedMemoryDataset for `ParallelRunner`. @@ -168,7 +168,7 @@ def _validate_nodes(cls, nodes: Iterable[Node]) -> None: ) @classmethod - def _validate_catalog(cls, catalog: DataCatalog, pipeline: Pipeline) -> None: + def _validate_catalog(cls, catalog: CatalogProtocol, pipeline: Pipeline) -> None: """Ensure that all data sets are serialisable and that we do not have any non proxied memory data sets being used as outputs as their content will not be synchronized across threads. @@ -213,7 +213,9 @@ def _validate_catalog(cls, catalog: DataCatalog, pipeline: Pipeline) -> None: f"MemoryDatasets" ) - def _set_manager_datasets(self, catalog: DataCatalog, pipeline: Pipeline) -> None: + def _set_manager_datasets( + self, catalog: CatalogProtocol, pipeline: Pipeline + ) -> None: for dataset in pipeline.datasets(): try: catalog.exists(dataset) @@ -240,7 +242,7 @@ def _get_required_workers_count(self, pipeline: Pipeline) -> int: def _run( self, pipeline: Pipeline, - catalog: DataCatalog, + catalog: CatalogProtocol, hook_manager: PluginManager, session_id: str | None = None, ) -> None: @@ -248,7 +250,7 @@ def _run( Args: pipeline: The ``Pipeline`` to run. - catalog: The ``DataCatalog`` from which to fetch data. + catalog: The ``CatalogProtocol`` from which to fetch data. hook_manager: The ``PluginManager`` to activate hooks. session_id: The id of the session. diff --git a/kedro/runner/runner.py b/kedro/runner/runner.py index 6f165e87c0..db397e5f84 100644 --- a/kedro/runner/runner.py +++ b/kedro/runner/runner.py @@ -21,7 +21,7 @@ from more_itertools import interleave from kedro.framework.hooks.manager import _NullPluginManager -from kedro.io import DataCatalog, MemoryDataset +from kedro.io import CatalogProtocol, MemoryDataset from kedro.pipeline import Pipeline if TYPE_CHECKING: @@ -45,7 +45,7 @@ def __init__( Args: is_async: If True, the node inputs and outputs are loaded and saved asynchronously with threads. Defaults to False. - extra_dataset_patterns: Extra dataset factory patterns to be added to the DataCatalog + extra_dataset_patterns: Extra dataset factory patterns to be added to the CatalogProtocol during the run. This is used to set the default datasets on the Runner instances. """ @@ -59,7 +59,7 @@ def _logger(self) -> logging.Logger: def run( self, pipeline: Pipeline, - catalog: DataCatalog, + catalog: CatalogProtocol, hook_manager: PluginManager | None = None, session_id: str | None = None, ) -> dict[str, Any]: @@ -68,7 +68,7 @@ def run( Args: pipeline: The ``Pipeline`` to run. - catalog: The ``DataCatalog`` from which to fetch data. + catalog: The ``CatalogProtocol`` from which to fetch data. hook_manager: The ``PluginManager`` to activate hooks. session_id: The id of the session. @@ -76,7 +76,7 @@ def run( ValueError: Raised when ``Pipeline`` inputs cannot be satisfied. Returns: - Any node outputs that cannot be processed by the ``DataCatalog``. + Any node outputs that cannot be processed by the ``CatalogProtocol``. These are returned in a dictionary, where the keys are defined by the node outputs. @@ -94,7 +94,7 @@ def run( if unsatisfied: raise ValueError( - f"Pipeline input(s) {unsatisfied} not found in the DataCatalog" + f"Pipeline input(s) {unsatisfied} not found in the CatalogProtocol" ) # Identify MemoryDataset in the catalog @@ -124,7 +124,7 @@ def run( return {ds_name: catalog.load(ds_name) for ds_name in free_outputs} def run_only_missing( - self, pipeline: Pipeline, catalog: DataCatalog, hook_manager: PluginManager + self, pipeline: Pipeline, catalog: CatalogProtocol, hook_manager: PluginManager ) -> dict[str, Any]: """Run only the missing outputs from the ``Pipeline`` using the datasets provided by ``catalog``, and save results back to the @@ -132,7 +132,7 @@ def run_only_missing( Args: pipeline: The ``Pipeline`` to run. - catalog: The ``DataCatalog`` from which to fetch data. + catalog: The ``CatalogProtocol`` from which to fetch data. hook_manager: The ``PluginManager`` to activate hooks. Raises: ValueError: Raised when ``Pipeline`` inputs cannot be @@ -140,7 +140,7 @@ def run_only_missing( Returns: Any node outputs that cannot be processed by the - ``DataCatalog``. These are returned in a dictionary, where + ``CatalogProtocol``. These are returned in a dictionary, where the keys are defined by the node outputs. """ @@ -164,7 +164,7 @@ def run_only_missing( def _run( self, pipeline: Pipeline, - catalog: DataCatalog, + catalog: CatalogProtocol, hook_manager: PluginManager, session_id: str | None = None, ) -> None: @@ -173,7 +173,7 @@ def _run( Args: pipeline: The ``Pipeline`` to run. - catalog: The ``DataCatalog`` from which to fetch data. + catalog: The ``CatalogProtocol`` from which to fetch data. hook_manager: The ``PluginManager`` to activate hooks. session_id: The id of the session. @@ -184,7 +184,7 @@ def _suggest_resume_scenario( self, pipeline: Pipeline, done_nodes: Iterable[Node], - catalog: DataCatalog, + catalog: CatalogProtocol, ) -> None: """ Suggest a command to the user to resume a run after it fails. @@ -194,7 +194,7 @@ def _suggest_resume_scenario( Args: pipeline: the ``Pipeline`` of the run. done_nodes: the ``Node``s that executed successfully. - catalog: the ``DataCatalog`` of the run. + catalog: the ``CatalogProtocol`` of the run. """ remaining_nodes = set(pipeline.nodes) - set(done_nodes) @@ -223,7 +223,7 @@ def _suggest_resume_scenario( def _find_nodes_to_resume_from( - pipeline: Pipeline, unfinished_nodes: Collection[Node], catalog: DataCatalog + pipeline: Pipeline, unfinished_nodes: Collection[Node], catalog: CatalogProtocol ) -> set[str]: """Given a collection of unfinished nodes in a pipeline using a certain catalog, find the node names to pass to pipeline.from_nodes() @@ -233,7 +233,7 @@ def _find_nodes_to_resume_from( Args: pipeline: the ``Pipeline`` to find starting nodes for. unfinished_nodes: collection of ``Node``s that have not finished yet - catalog: the ``DataCatalog`` of the run. + catalog: the ``CatalogProtocol`` of the run. Returns: Set of node names to pass to pipeline.from_nodes() to continue @@ -251,7 +251,7 @@ def _find_nodes_to_resume_from( def _find_all_nodes_for_resumed_pipeline( - pipeline: Pipeline, unfinished_nodes: Iterable[Node], catalog: DataCatalog + pipeline: Pipeline, unfinished_nodes: Iterable[Node], catalog: CatalogProtocol ) -> set[Node]: """Breadth-first search approach to finding the complete set of ``Node``s which need to run to cover all unfinished nodes, @@ -261,7 +261,7 @@ def _find_all_nodes_for_resumed_pipeline( Args: pipeline: the ``Pipeline`` to analyze. unfinished_nodes: the iterable of ``Node``s which have not finished yet. - catalog: the ``DataCatalog`` of the run. + catalog: the ``CatalogProtocol`` of the run. Returns: A set containing all input unfinished ``Node``s and all remaining @@ -309,12 +309,12 @@ def _nodes_with_external_inputs(nodes_of_interest: Iterable[Node]) -> set[Node]: return set(p_nodes_with_external_inputs.nodes) -def _enumerate_non_persistent_inputs(node: Node, catalog: DataCatalog) -> set[str]: +def _enumerate_non_persistent_inputs(node: Node, catalog: CatalogProtocol) -> set[str]: """Enumerate non-persistent input datasets of a ``Node``. Args: node: the ``Node`` to check the inputs of. - catalog: the ``DataCatalog`` of the run. + catalog: the ``CatalogProtocol`` of the run. Returns: Set of names of non-persistent inputs of given ``Node``. @@ -379,7 +379,7 @@ def _find_initial_node_group(pipeline: Pipeline, nodes: Iterable[Node]) -> list[ def run_node( node: Node, - catalog: DataCatalog, + catalog: CatalogProtocol, hook_manager: PluginManager, is_async: bool = False, session_id: str | None = None, @@ -388,7 +388,7 @@ def run_node( Args: node: The ``Node`` to run. - catalog: A ``DataCatalog`` containing the node's inputs and outputs. + catalog: A ``CatalogProtocol`` containing the node's inputs and outputs. hook_manager: The ``PluginManager`` to activate hooks. is_async: If True, the node inputs and outputs are loaded and saved asynchronously with threads. Defaults to False. @@ -422,7 +422,7 @@ def run_node( def _collect_inputs_from_hook( # noqa: PLR0913 node: Node, - catalog: DataCatalog, + catalog: CatalogProtocol, inputs: dict[str, Any], is_async: bool, hook_manager: PluginManager, @@ -455,7 +455,7 @@ def _collect_inputs_from_hook( # noqa: PLR0913 def _call_node_run( # noqa: PLR0913 node: Node, - catalog: DataCatalog, + catalog: CatalogProtocol, inputs: dict[str, Any], is_async: bool, hook_manager: PluginManager, @@ -486,7 +486,7 @@ def _call_node_run( # noqa: PLR0913 def _run_node_sequential( node: Node, - catalog: DataCatalog, + catalog: CatalogProtocol, hook_manager: PluginManager, session_id: str | None = None, ) -> Node: @@ -533,7 +533,7 @@ def _run_node_sequential( def _run_node_async( node: Node, - catalog: DataCatalog, + catalog: CatalogProtocol, hook_manager: PluginManager, session_id: str | None = None, ) -> Node: diff --git a/kedro/runner/sequential_runner.py b/kedro/runner/sequential_runner.py index 48dac3cd54..fd5ec26834 100644 --- a/kedro/runner/sequential_runner.py +++ b/kedro/runner/sequential_runner.py @@ -14,7 +14,7 @@ if TYPE_CHECKING: from pluggy import PluginManager - from kedro.io import DataCatalog + from kedro.io import CatalogProtocol from kedro.pipeline import Pipeline @@ -34,7 +34,7 @@ def __init__( Args: is_async: If True, the node inputs and outputs are loaded and saved asynchronously with threads. Defaults to False. - extra_dataset_patterns: Extra dataset factory patterns to be added to the DataCatalog + extra_dataset_patterns: Extra dataset factory patterns to be added to the CatalogProtocol during the run. This is used to set the default datasets to MemoryDataset for `SequentialRunner`. @@ -48,7 +48,7 @@ def __init__( def _run( self, pipeline: Pipeline, - catalog: DataCatalog, + catalog: CatalogProtocol, hook_manager: PluginManager, session_id: str | None = None, ) -> None: @@ -56,7 +56,7 @@ def _run( Args: pipeline: The ``Pipeline`` to run. - catalog: The ``DataCatalog`` from which to fetch data. + catalog: The ``CatalogProtocol`` from which to fetch data. hook_manager: The ``PluginManager`` to activate hooks. session_id: The id of the session. diff --git a/kedro/runner/thread_runner.py b/kedro/runner/thread_runner.py index b4751a602a..39a726db46 100644 --- a/kedro/runner/thread_runner.py +++ b/kedro/runner/thread_runner.py @@ -16,7 +16,7 @@ if TYPE_CHECKING: from pluggy import PluginManager - from kedro.io import DataCatalog + from kedro.io import CatalogProtocol from kedro.pipeline import Pipeline from kedro.pipeline.node import Node @@ -43,7 +43,7 @@ def __init__( is_async: If True, set to False, because `ThreadRunner` doesn't support loading and saving the node inputs and outputs asynchronously with threads. Defaults to False. - extra_dataset_patterns: Extra dataset factory patterns to be added to the DataCatalog + extra_dataset_patterns: Extra dataset factory patterns to be added to the CatalogProtocol during the run. This is used to set the default datasets to MemoryDataset for `ThreadRunner`. @@ -87,7 +87,7 @@ def _get_required_workers_count(self, pipeline: Pipeline) -> int: def _run( self, pipeline: Pipeline, - catalog: DataCatalog, + catalog: CatalogProtocol, hook_manager: PluginManager, session_id: str | None = None, ) -> None: @@ -95,7 +95,7 @@ def _run( Args: pipeline: The ``Pipeline`` to run. - catalog: The ``DataCatalog`` from which to fetch data. + catalog: The ``CatalogProtocol`` from which to fetch data. hook_manager: The ``PluginManager`` to activate hooks. session_id: The id of the session. From 18d2ba0100595b70a54688e4b9d45f1b19ed8c5d Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 12 Sep 2024 11:47:58 +0100 Subject: [PATCH 100/173] Fixed linter Signed-off-by: Elena Khaustova --- kedro/io/core.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index ecdb85d505..443081e2de 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -905,7 +905,10 @@ def from_config(cls, catalog: dict[str, dict[str, Any]] | None) -> _C: ... def _get_dataset( - self, ds_name: str, suggest: bool = True, version: Any = None + self, + dataset_name: str, + version: Any = None, + suggest: bool = True, ) -> AbstractDataset: """Retrieve a dataset by its name.""" ... @@ -918,7 +921,7 @@ def save(self, name: str, data: Any) -> None: """Save data to a registered dataset.""" ... - def load(self, name: str, version: str | None = None) -> _DO: + def load(self, name: str, version: str | None = None) -> Any: """Load data from a registered dataset.""" ... From d48c6d3e024bd924854c5e5fd0173089c6bf74b5 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 12 Sep 2024 13:05:28 +0100 Subject: [PATCH 101/173] Added _ImplementsCatalogProtocolValidator Signed-off-by: Elena Khaustova --- kedro/framework/project/__init__.py | 25 +++++++++++++++++++++++-- kedro/io/core.py | 2 +- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/kedro/framework/project/__init__.py b/kedro/framework/project/__init__.py index a3248b9daf..195fa077f6 100644 --- a/kedro/framework/project/__init__.py +++ b/kedro/framework/project/__init__.py @@ -20,6 +20,7 @@ from dynaconf import LazySettings from dynaconf.validator import ValidationError, Validator +from kedro.io import CatalogProtocol from kedro.pipeline import Pipeline, pipeline if TYPE_CHECKING: @@ -59,6 +60,25 @@ def validate( ) +class _ImplementsCatalogProtocolValidator(Validator): + """A validator to check if the supplied setting value is a subclass of the default class""" + + def validate( + self, settings: dynaconf.base.Settings, *args: Any, **kwargs: Any + ) -> None: + super().validate(settings, *args, **kwargs) + + protocol = CatalogProtocol + for name in self.names: + setting_value = getattr(settings, name) + if not isinstance(setting_value(), protocol): + raise ValidationError( + f"Invalid value '{setting_value.__module__}.{setting_value.__qualname__}' " + f"received for setting '{name}'. It must implement " + f"'{protocol.__module__}.{protocol.__qualname__}'." + ) + + class _HasSharedParentClassValidator(Validator): """A validator to check that the parent of the default class is an ancestor of the settings value.""" @@ -115,8 +135,9 @@ class _ProjectSettings(LazySettings): _CONFIG_LOADER_ARGS = Validator( "CONFIG_LOADER_ARGS", default={"base_env": "base", "default_run_env": "local"} ) - _DATA_CATALOG_CLASS = _IsSubclassValidator( - "DATA_CATALOG_CLASS", default=_get_default_class("kedro.io.DataCatalog") + _DATA_CATALOG_CLASS = _ImplementsCatalogProtocolValidator( + "DATA_CATALOG_CLASS", + default=_get_default_class("kedro.io.DataCatalog"), ) def __init__(self, *args: Any, **kwargs: Any): diff --git a/kedro/io/core.py b/kedro/io/core.py index 443081e2de..c0966ea984 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -887,7 +887,7 @@ def validate_on_forbidden_chars(**kwargs: Any) -> None: @runtime_checkable -class CatalogProtocol(Protocol["_C"]): +class CatalogProtocol(Protocol[_C]): _datasets: dict[str, AbstractDataset] def __contains__(self, ds_name: str) -> bool: From 45ce6bcabaf7408cd8b92461ecbd10f7d9f5986f Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 12 Sep 2024 15:20:03 +0100 Subject: [PATCH 102/173] Updated docstrings Signed-off-by: Elena Khaustova --- kedro/framework/context/context.py | 12 ++++++------ kedro/runner/parallel_runner.py | 6 +++--- kedro/runner/runner.py | 24 ++++++++++++------------ kedro/runner/sequential_runner.py | 4 ++-- kedro/runner/thread_runner.py | 4 ++-- 5 files changed, 25 insertions(+), 25 deletions(-) diff --git a/kedro/framework/context/context.py b/kedro/framework/context/context.py index 25995eb63f..5c14cbae38 100644 --- a/kedro/framework/context/context.py +++ b/kedro/framework/context/context.py @@ -179,12 +179,12 @@ class KedroContext: @property def catalog(self) -> CatalogProtocol: - """Read-only property referring to Kedro's ``DataCatalog`` for this context. + """Read-only property referring to Kedro's catalog` for this context. Returns: - DataCatalog defined in `catalog.yml`. + catalog defined in `catalog.yml`. Raises: - KedroContextError: Incorrect ``DataCatalog`` registered for the project. + KedroContextError: Incorrect catalog registered for the project. """ return self._get_catalog() @@ -214,12 +214,12 @@ def _get_catalog( save_version: str | None = None, load_versions: dict[str, str] | None = None, ) -> CatalogProtocol: - """A hook for changing the creation of a DataCatalog instance. + """A hook for changing the creation of a catalog instance. Returns: - DataCatalog defined in `catalog.yml`. + catalog defined in `catalog.yml`. Raises: - KedroContextError: Incorrect ``DataCatalog`` registered for the project. + KedroContextError: Incorrect catalog registered for the project. """ # '**/catalog*' reads modular pipeline configs diff --git a/kedro/runner/parallel_runner.py b/kedro/runner/parallel_runner.py index 903c9ece99..6c56a54b4e 100644 --- a/kedro/runner/parallel_runner.py +++ b/kedro/runner/parallel_runner.py @@ -73,7 +73,7 @@ def _run_node_synchronization( # noqa: PLR0913 Args: node: The ``Node`` to run. - catalog: A ``CatalogProtocol`` containing the node's inputs and outputs. + catalog: A catalog containing the node's inputs and outputs. is_async: If True, the node inputs and outputs are loaded and saved asynchronously with threads. Defaults to False. session_id: The session id of the pipeline run. @@ -118,7 +118,7 @@ def __init__( cannot be larger than 61 and will be set to min(61, max_workers). is_async: If True, the node inputs and outputs are loaded and saved asynchronously with threads. Defaults to False. - extra_dataset_patterns: Extra dataset factory patterns to be added to the CatalogProtocol + extra_dataset_patterns: Extra dataset factory patterns to be added to the catalog during the run. This is used to set the default datasets to SharedMemoryDataset for `ParallelRunner`. @@ -250,7 +250,7 @@ def _run( Args: pipeline: The ``Pipeline`` to run. - catalog: The ``CatalogProtocol`` from which to fetch data. + catalog: The `catalog from which to fetch data. hook_manager: The ``PluginManager`` to activate hooks. session_id: The id of the session. diff --git a/kedro/runner/runner.py b/kedro/runner/runner.py index db397e5f84..48cdef7d20 100644 --- a/kedro/runner/runner.py +++ b/kedro/runner/runner.py @@ -45,7 +45,7 @@ def __init__( Args: is_async: If True, the node inputs and outputs are loaded and saved asynchronously with threads. Defaults to False. - extra_dataset_patterns: Extra dataset factory patterns to be added to the CatalogProtocol + extra_dataset_patterns: Extra dataset factory patterns to be added to the catalog during the run. This is used to set the default datasets on the Runner instances. """ @@ -68,7 +68,7 @@ def run( Args: pipeline: The ``Pipeline`` to run. - catalog: The ``CatalogProtocol`` from which to fetch data. + catalog: The catalog from which to fetch data. hook_manager: The ``PluginManager`` to activate hooks. session_id: The id of the session. @@ -76,7 +76,7 @@ def run( ValueError: Raised when ``Pipeline`` inputs cannot be satisfied. Returns: - Any node outputs that cannot be processed by the ``CatalogProtocol``. + Any node outputs that cannot be processed by the catalog. These are returned in a dictionary, where the keys are defined by the node outputs. @@ -94,7 +94,7 @@ def run( if unsatisfied: raise ValueError( - f"Pipeline input(s) {unsatisfied} not found in the CatalogProtocol" + f"Pipeline input(s) {unsatisfied} not found in the {catalog.__class__.__name__}" ) # Identify MemoryDataset in the catalog @@ -132,7 +132,7 @@ def run_only_missing( Args: pipeline: The ``Pipeline`` to run. - catalog: The ``CatalogProtocol`` from which to fetch data. + catalog: The `catalog from which to fetch data. hook_manager: The ``PluginManager`` to activate hooks. Raises: ValueError: Raised when ``Pipeline`` inputs cannot be @@ -140,7 +140,7 @@ def run_only_missing( Returns: Any node outputs that cannot be processed by the - ``CatalogProtocol``. These are returned in a dictionary, where + catalog. These are returned in a dictionary, where the keys are defined by the node outputs. """ @@ -173,7 +173,7 @@ def _run( Args: pipeline: The ``Pipeline`` to run. - catalog: The ``CatalogProtocol`` from which to fetch data. + catalog: The `catalog from which to fetch data. hook_manager: The ``PluginManager`` to activate hooks. session_id: The id of the session. @@ -194,7 +194,7 @@ def _suggest_resume_scenario( Args: pipeline: the ``Pipeline`` of the run. done_nodes: the ``Node``s that executed successfully. - catalog: the ``CatalogProtocol`` of the run. + catalog: the catalog of the run. """ remaining_nodes = set(pipeline.nodes) - set(done_nodes) @@ -233,7 +233,7 @@ def _find_nodes_to_resume_from( Args: pipeline: the ``Pipeline`` to find starting nodes for. unfinished_nodes: collection of ``Node``s that have not finished yet - catalog: the ``CatalogProtocol`` of the run. + catalog: the catalog of the run. Returns: Set of node names to pass to pipeline.from_nodes() to continue @@ -261,7 +261,7 @@ def _find_all_nodes_for_resumed_pipeline( Args: pipeline: the ``Pipeline`` to analyze. unfinished_nodes: the iterable of ``Node``s which have not finished yet. - catalog: the ``CatalogProtocol`` of the run. + catalog: the catalog of the run. Returns: A set containing all input unfinished ``Node``s and all remaining @@ -314,7 +314,7 @@ def _enumerate_non_persistent_inputs(node: Node, catalog: CatalogProtocol) -> se Args: node: the ``Node`` to check the inputs of. - catalog: the ``CatalogProtocol`` of the run. + catalog: the catalog of the run. Returns: Set of names of non-persistent inputs of given ``Node``. @@ -388,7 +388,7 @@ def run_node( Args: node: The ``Node`` to run. - catalog: A ``CatalogProtocol`` containing the node's inputs and outputs. + catalog: A catalog containing the node's inputs and outputs. hook_manager: The ``PluginManager`` to activate hooks. is_async: If True, the node inputs and outputs are loaded and saved asynchronously with threads. Defaults to False. diff --git a/kedro/runner/sequential_runner.py b/kedro/runner/sequential_runner.py index fd5ec26834..3f60414b2b 100644 --- a/kedro/runner/sequential_runner.py +++ b/kedro/runner/sequential_runner.py @@ -34,7 +34,7 @@ def __init__( Args: is_async: If True, the node inputs and outputs are loaded and saved asynchronously with threads. Defaults to False. - extra_dataset_patterns: Extra dataset factory patterns to be added to the CatalogProtocol + extra_dataset_patterns: Extra dataset factory patterns to be added to the catalog during the run. This is used to set the default datasets to MemoryDataset for `SequentialRunner`. @@ -56,7 +56,7 @@ def _run( Args: pipeline: The ``Pipeline`` to run. - catalog: The ``CatalogProtocol`` from which to fetch data. + catalog: The catalog from which to fetch data. hook_manager: The ``PluginManager`` to activate hooks. session_id: The id of the session. diff --git a/kedro/runner/thread_runner.py b/kedro/runner/thread_runner.py index 39a726db46..d2c0c251dc 100644 --- a/kedro/runner/thread_runner.py +++ b/kedro/runner/thread_runner.py @@ -43,7 +43,7 @@ def __init__( is_async: If True, set to False, because `ThreadRunner` doesn't support loading and saving the node inputs and outputs asynchronously with threads. Defaults to False. - extra_dataset_patterns: Extra dataset factory patterns to be added to the CatalogProtocol + extra_dataset_patterns: Extra dataset factory patterns to be added to the catalog during the run. This is used to set the default datasets to MemoryDataset for `ThreadRunner`. @@ -95,7 +95,7 @@ def _run( Args: pipeline: The ``Pipeline`` to run. - catalog: The ``CatalogProtocol`` from which to fetch data. + catalog: The catalog from which to fetch data. hook_manager: The ``PluginManager`` to activate hooks. session_id: The id of the session. From 6ca972f89a695943764da91dff6b1a792f3501f0 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 12 Sep 2024 22:45:31 +0100 Subject: [PATCH 103/173] Fixed tests Signed-off-by: Elena Khaustova --- tests/framework/context/test_context.py | 2 +- tests/runner/test_sequential_runner.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/framework/context/test_context.py b/tests/framework/context/test_context.py index 61e4bbaa6f..ea62cb04c9 100644 --- a/tests/framework/context/test_context.py +++ b/tests/framework/context/test_context.py @@ -261,7 +261,7 @@ def test_wrong_catalog_type(self, mock_settings_file_bad_data_catalog_class): pattern = ( "Invalid value 'tests.framework.context.test_context.BadCatalog' received " "for setting 'DATA_CATALOG_CLASS'. " - "It must be a subclass of 'kedro.io.data_catalog.DataCatalog'." + "It must implement 'kedro.io.core.CatalogProtocol'." ) mock_settings = _ProjectSettings( settings_file=str(mock_settings_file_bad_data_catalog_class) diff --git a/tests/runner/test_sequential_runner.py b/tests/runner/test_sequential_runner.py index dbc73a30f0..4f22bab296 100644 --- a/tests/runner/test_sequential_runner.py +++ b/tests/runner/test_sequential_runner.py @@ -130,7 +130,9 @@ def test_conflict_feed_catalog( def test_unsatisfied_inputs(self, is_async, unfinished_outputs_pipeline, catalog): """ds1, ds2 and ds3 were not specified.""" - with pytest.raises(ValueError, match=r"not found in the DataCatalog"): + with pytest.raises( + ValueError, match=rf"not found in the {catalog.__class__.__name__}" + ): SequentialRunner(is_async=is_async).run( unfinished_outputs_pipeline, catalog ) From fdce5eadeab58a70a063ba0ada21805a9369c38a Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 12 Sep 2024 22:46:44 +0100 Subject: [PATCH 104/173] Fixed docs Signed-off-by: Elena Khaustova --- docs/source/conf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/conf.py b/docs/source/conf.py index 2c3a2c4c00..a61ba1b08f 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -130,6 +130,7 @@ "kedro.io.catalog_config_resolver.CatalogConfigResolver", "kedro.io.core.AbstractDataset", "kedro.io.core.AbstractVersionedDataset", + "kedro.io.core.CatalogProtocol", "kedro.io.core.DatasetError", "kedro.io.core.Version", "kedro.io.data_catalog.DataCatalog", @@ -170,6 +171,7 @@ "None. Update D from mapping/iterable E and F.", "Patterns", "CatalogConfigResolver", + "CatalogProtocol", ), "py:data": ( "typing.Any", From 3029963a250097e0f4f83a0f730a6d92ef27db97 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 12 Sep 2024 22:47:39 +0100 Subject: [PATCH 105/173] Excluded Potocol from coverage Signed-off-by: Elena Khaustova --- kedro/io/core.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index c0966ea984..1ed35a871b 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -939,7 +939,7 @@ def add_feed_dict(self, datasets: dict[str, Any], replace: bool = False) -> None def exists(self, name: str) -> bool: """Checks whether registered data set exists by calling its `exists()` method.""" - pass + ... def release(self, name: str) -> None: """Release any cached data associated with a dataset.""" diff --git a/pyproject.toml b/pyproject.toml index 8b7b4cb09b..d9ebbfd70b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -134,7 +134,7 @@ omit = [ "kedro/runner/parallel_runner.py", "*/site-packages/*", ] -exclude_also = ["raise NotImplementedError", "if TYPE_CHECKING:"] +exclude_also = ["raise NotImplementedError", "if TYPE_CHECKING:", "class CatalogProtocol"] [tool.pytest.ini_options] addopts=""" From 0833a843fdf189ab10023bcc0d278b562405a6b8 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 12 Sep 2024 23:00:37 +0100 Subject: [PATCH 106/173] Fixed docs Signed-off-by: Elena Khaustova --- kedro/runner/runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro/runner/runner.py b/kedro/runner/runner.py index 48cdef7d20..bb680aefe6 100644 --- a/kedro/runner/runner.py +++ b/kedro/runner/runner.py @@ -132,7 +132,7 @@ def run_only_missing( Args: pipeline: The ``Pipeline`` to run. - catalog: The `catalog from which to fetch data. + catalog: The catalog from which to fetch data. hook_manager: The ``PluginManager`` to activate hooks. Raises: ValueError: Raised when ``Pipeline`` inputs cannot be From 07908a8c83ae4af5d0bac9ad6c12e37e317046ec Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 13 Sep 2024 10:59:19 +0100 Subject: [PATCH 107/173] Renamed catalog source to kedro_data_catalog Signed-off-by: Elena Khaustova --- docs/source/conf.py | 2 +- kedro/framework/project/__init__.py | 2 +- kedro/io/__init__.py | 2 +- kedro/io/{data_catalog_redesign.py => kedro_data_catalog.py} | 0 4 files changed, 3 insertions(+), 3 deletions(-) rename kedro/io/{data_catalog_redesign.py => kedro_data_catalog.py} (100%) diff --git a/docs/source/conf.py b/docs/source/conf.py index 029a1db6e6..5529973d2f 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -133,7 +133,7 @@ "kedro.io.core.DatasetError", "kedro.io.core.Version", "kedro.io.data_catalog.DataCatalog", - "kedro.io.data_catalog_redesign.KedroDataCatalog", + "kedro.io.kedro_data_catalog.KedroDataCatalog", "kedro.io.memory_dataset.MemoryDataset", "kedro.io.partitioned_dataset.PartitionedDataset", "kedro.pipeline.pipeline.Pipeline", diff --git a/kedro/framework/project/__init__.py b/kedro/framework/project/__init__.py index a16e033528..5d76bb78fa 100644 --- a/kedro/framework/project/__init__.py +++ b/kedro/framework/project/__init__.py @@ -55,7 +55,7 @@ def validate( # TODO: remove with the old catalog if ( f"{setting_value.__module__}.{setting_value.__qualname__}" - == "kedro.io.data_catalog_redesign.KedroDataCatalog" + == "kedro.io.kedro_data_catalog.KedroDataCatalog" and f"{default_class.__module__}.{default_class.__qualname__}" == "kedro.io.data_catalog.DataCatalog" ): diff --git a/kedro/io/__init__.py b/kedro/io/__init__.py index 1f45ec7bcd..b2a2b2c4c6 100644 --- a/kedro/io/__init__.py +++ b/kedro/io/__init__.py @@ -15,7 +15,7 @@ Version, ) from .data_catalog import DataCatalog -from .data_catalog_redesign import BaseDataCatalog, KedroDataCatalog +from .kedro_data_catalog import BaseDataCatalog, KedroDataCatalog from .lambda_dataset import LambdaDataset from .memory_dataset import MemoryDataset from .shared_memory_dataset import SharedMemoryDataset diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/kedro_data_catalog.py similarity index 100% rename from kedro/io/data_catalog_redesign.py rename to kedro/io/kedro_data_catalog.py From 25a6fcf0ca4bd21b8eccda734eb562c511b46719 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 13 Sep 2024 11:01:44 +0100 Subject: [PATCH 108/173] Renamed data set to dataset in docstrings Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 375ba6bcaf..61a00f2535 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -1,8 +1,8 @@ """``KedroDataCatalog`` stores instances of ``AbstractDataset`` implementations to provide ``load`` and ``save`` capabilities from anywhere in the program. To -use a ``KedroDataCatalog``, you need to instantiate it with a dictionary of data -sets. Then it will act as a single point of reference for your calls, -relaying load and save functions to the underlying data sets. +use a ``KedroDataCatalog``, you need to instantiate it with a dictionary of datasets. +Then it will act as a single point of reference for your calls, relaying load and +save functions to the underlying datasets. """ from __future__ import annotations @@ -207,7 +207,7 @@ def list(self, regex_search: str | None = None) -> list[str]: return list(self._datasets.keys()) if not regex_search.strip(): - self._logger.warning("The empty string will not match any data sets") + self._logger.warning("The empty string will not match any datasets") return [] try: @@ -219,7 +219,7 @@ def list(self, regex_search: str | None = None) -> list[str]: return [ds_name for ds_name in self._datasets if pattern.search(ds_name)] def save(self, name: str, data: Any) -> None: - """Save data to a registered data set.""" + """Save data to a registered dataset.""" dataset = self.get_dataset(name) self._logger.info( @@ -232,11 +232,11 @@ def save(self, name: str, data: Any) -> None: dataset.save(data) def release(self, name: str) -> None: - """Release any cached data associated with a data set + """Release any cached data associated with a dataset Args: - name: A data set to be checked. + name: A dataset to be checked. Raises: - DatasetNotFoundError: When a data set with the given name + DatasetNotFoundError: When a dataset with the given name has not yet been registered. """ dataset = self.get_dataset(name) @@ -258,7 +258,7 @@ def confirm(self, name: str) -> None: raise DatasetError(f"Dataset '{name}' does not have 'confirm' method") def load(self, name: str, version: str | None = None) -> Any: - """Loads a registered data set.""" + """Loads a registered dataset.""" load_version = Version(version, None) if version else None dataset = self.get_dataset(name, version=load_version) @@ -300,15 +300,15 @@ def shallow_copy( return self def exists(self, name: str) -> bool: - """Checks whether registered data set exists by calling its `exists()` + """Checks whether registered dataset exists by calling its `exists()` method. Raises a warning and returns False if `exists()` is not implemented. Args: - name: A data set to be checked. + name: A dataset to be checked. Returns: - Whether the data set output exists. + Whether the dataset output exists. """ try: From 07f8c126e4ed7000bcbf14c799dafcd36ff4cc25 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 13 Sep 2024 11:24:24 +0100 Subject: [PATCH 109/173] Updated add_from_dict Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 41 ++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 61a00f2535..9fd1f7681e 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -32,11 +32,33 @@ class KedroDataCatalog: def __init__( self, datasets: dict[str, AbstractDataset] | None = None, - feed_dict: dict[str, Any] | None = None, + data: dict[str, Any] | None = None, load_versions: dict[str, str] | None = None, save_version: str | None = None, config_resolver: CatalogConfigResolver | None = None, ) -> None: + """``KedroDataCatalog`` stores instances of ``AbstractDataset`` + implementations to provide ``load`` and ``save`` capabilities from + anywhere in the program. To use a ``KedroDataCatalog``, you need to + instantiate it with a dictionary of data sets. Then it will act as a + single point of reference for your calls, relaying load and save + functions to the underlying data sets. + + Args: + datasets: A dictionary of data set names and data set instances. + data: A dictionary with data to be added in memory as `MemoryDataset`` instances. + Keys represent dataset names and the values are raw data. + load_versions: A mapping between data set names and versions + to load. Has no effect on data sets without enabled versioning. + save_version: Version string to be used for ``save`` operations + by all data sets with enabled versioning. It must: a) be a + case-insensitive string that conforms with operating system + filename limitations, b) always return the latest version when + sorted in lexicographical order.. + config_resolver: An instance of CatalogConfigResolver to resolve dataset patterns and configurations. + + + """ self._config_resolver = config_resolver or CatalogConfigResolver() self._datasets = datasets or {} self._load_versions = load_versions or {} @@ -47,8 +69,8 @@ def __init__( for ds_name, ds_config in self._config_resolver.config.items(): self._add_from_config(ds_name, ds_config) - if feed_dict: - self.add_from_dict(feed_dict) + if data: + self.add_from_dict(data) @property def datasets(self) -> dict[str, Any]: @@ -271,16 +293,11 @@ def load(self, name: str, version: str | None = None) -> Any: return dataset.load() - def add_from_dict(self, datasets: dict[str, Any], replace: bool = False) -> None: - # Consider changing to add memory datasets only, to simplify the method, + def add_from_dict(self, data: dict[str, Any], replace: bool = False) -> None: + # This method was simplified to add memory datasets only, since # adding AbstractDataset can be done via add() method - for ds_name, ds_data in datasets.items(): - dataset = ( - ds_data - if isinstance(ds_data, AbstractDataset) - else MemoryDataset(data=ds_data) # type: ignore[abstract] - ) - self.add(ds_name, dataset, replace) + for ds_name, ds_data in data.items(): + self.add(ds_name, MemoryDataset(data=ds_data), replace) def add_feed_dict(self, feed_dict: dict[str, Any], replace: bool = False) -> None: # TODO: remove when removing old catalog From 3a1a0f26195ac217267f2d53ff8e68ba40acd6a9 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 13 Sep 2024 11:38:51 +0100 Subject: [PATCH 110/173] Revised comments and TODOs Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 9fd1f7681e..9256d2a643 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -32,7 +32,7 @@ class KedroDataCatalog: def __init__( self, datasets: dict[str, AbstractDataset] | None = None, - data: dict[str, Any] | None = None, + raw_data: dict[str, Any] | None = None, load_versions: dict[str, str] | None = None, save_version: str | None = None, config_resolver: CatalogConfigResolver | None = None, @@ -46,7 +46,7 @@ def __init__( Args: datasets: A dictionary of data set names and data set instances. - data: A dictionary with data to be added in memory as `MemoryDataset`` instances. + raw_data: A dictionary with data to be added in memory as `MemoryDataset`` instances. Keys represent dataset names and the values are raw data. load_versions: A mapping between data set names and versions to load. Has no effect on data sets without enabled versioning. @@ -69,8 +69,8 @@ def __init__( for ds_name, ds_config in self._config_resolver.config.items(): self._add_from_config(ds_name, ds_config) - if data: - self.add_from_dict(data) + if raw_data: + self.add_raw_data(raw_data) @property def datasets(self) -> dict[str, Any]: @@ -159,8 +159,8 @@ def _validate_dataset_config(ds_name: str, ds_config: Any) -> None: ) def _add_from_config(self, ds_name: str, ds_config: dict[str, Any]) -> None: - # Add lazy loading feature to store the configuration but not to init actual dataset - # Initialise actual dataset when load or save + # TODO: Add lazy loading feature to store the configuration but not to init actual dataset + # TODO: Initialise actual dataset when load or save self._validate_dataset_config(ds_name, ds_config) ds = AbstractDataset.from_config( ds_name, @@ -177,7 +177,7 @@ def get_dataset( ds_config = self._config_resolver.resolve_dataset_pattern(ds_name) if ds_name not in self._datasets and ds_config is not None: - self._add_from_config(ds_name, ds_config) # type: ignore[arg-type] + self._add_from_config(ds_name, ds_config) dataset = self._datasets.get(ds_name, None) @@ -293,7 +293,7 @@ def load(self, name: str, version: str | None = None) -> Any: return dataset.load() - def add_from_dict(self, data: dict[str, Any], replace: bool = False) -> None: + def add_raw_data(self, data: dict[str, Any], replace: bool = False) -> None: # This method was simplified to add memory datasets only, since # adding AbstractDataset can be done via add() method for ds_name, ds_data in data.items(): @@ -301,7 +301,7 @@ def add_from_dict(self, data: dict[str, Any], replace: bool = False) -> None: def add_feed_dict(self, feed_dict: dict[str, Any], replace: bool = False) -> None: # TODO: remove when removing old catalog - return self.add_from_dict(feed_dict, replace) + return self.add_raw_data(feed_dict, replace) def shallow_copy( self, extra_dataset_patterns: Patterns | None = None From cf663a09139f995c64c1f1c2cca4aa36c3879f46 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 13 Sep 2024 11:42:08 +0100 Subject: [PATCH 111/173] Updated error message to point to specific catalog type Signed-off-by: Elena Khaustova --- kedro/runner/runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro/runner/runner.py b/kedro/runner/runner.py index 2ace73a10c..2b64cb6bf1 100644 --- a/kedro/runner/runner.py +++ b/kedro/runner/runner.py @@ -94,7 +94,7 @@ def run( if unsatisfied: raise ValueError( - f"Pipeline input(s) {unsatisfied} not found in the BaseDataCatalog" + f"Pipeline input(s) {unsatisfied} not found in the {catalog.__class__.__name__}" ) # Identify MemoryDataset in the catalog From caa7316f4c57483f1ce2059106414df83872489f Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 13 Sep 2024 12:06:08 +0100 Subject: [PATCH 112/173] Fixed tests Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 2 +- tests/io/test_data_catalog.py | 4 ++-- tests/runner/test_sequential_runner.py | 4 +++- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 9256d2a643..cba1b8a28f 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -297,7 +297,7 @@ def add_raw_data(self, data: dict[str, Any], replace: bool = False) -> None: # This method was simplified to add memory datasets only, since # adding AbstractDataset can be done via add() method for ds_name, ds_data in data.items(): - self.add(ds_name, MemoryDataset(data=ds_data), replace) + self.add(ds_name, MemoryDataset(data=ds_data), replace) # type: ignore[abstract] def add_feed_dict(self, feed_dict: dict[str, Any], replace: bool = False) -> None: # TODO: remove when removing old catalog diff --git a/tests/io/test_data_catalog.py b/tests/io/test_data_catalog.py index be8ed0831e..db777cc634 100644 --- a/tests/io/test_data_catalog.py +++ b/tests/io/test_data_catalog.py @@ -925,7 +925,7 @@ def test_multiple_catch_all_patterns_not_allowed( } with pytest.raises( - ValueError, match="Multiple catch-all patterns found in the catalog" + DatasetError, match="Multiple catch-all patterns found in the catalog" ): DataCatalog.from_config(**config_with_dataset_factories) @@ -1019,7 +1019,7 @@ def test_unmatched_key_error_when_parsing_config( "Unable to resolve 'data/01_raw/{brand}_plane.pq' from the pattern '{type}@planes'. " "Keys used in the configuration should be present in the dataset factory pattern." ) - with pytest.raises(KeyError, match=re.escape(pattern)): + with pytest.raises(DatasetError, match=re.escape(pattern)): catalog._get_dataset("jet@planes") def test_factory_config_versioned( diff --git a/tests/runner/test_sequential_runner.py b/tests/runner/test_sequential_runner.py index c8f54f7337..4f22bab296 100644 --- a/tests/runner/test_sequential_runner.py +++ b/tests/runner/test_sequential_runner.py @@ -130,7 +130,9 @@ def test_conflict_feed_catalog( def test_unsatisfied_inputs(self, is_async, unfinished_outputs_pipeline, catalog): """ds1, ds2 and ds3 were not specified.""" - with pytest.raises(ValueError, match=r"not found in the BaseDataCatalog"): + with pytest.raises( + ValueError, match=rf"not found in the {catalog.__class__.__name__}" + ): SequentialRunner(is_async=is_async).run( unfinished_outputs_pipeline, catalog ) From 0ac154daa3f7a3c0f97d661ac04b65bc6e3f3bee Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 13 Sep 2024 12:19:59 +0100 Subject: [PATCH 113/173] Merged with protocol Signed-off-by: Elena Khaustova --- kedro/framework/hooks/specs.py | 12 ++++++------ kedro/framework/project/__init__.py | 9 --------- kedro/io/__init__.py | 3 +-- kedro/io/kedro_data_catalog.py | 6 +----- 4 files changed, 8 insertions(+), 22 deletions(-) diff --git a/kedro/framework/hooks/specs.py b/kedro/framework/hooks/specs.py index c80e9625ec..996134920e 100644 --- a/kedro/framework/hooks/specs.py +++ b/kedro/framework/hooks/specs.py @@ -63,7 +63,7 @@ def before_node_run( Args: node: The ``Node`` to run. - catalog: A ``BaseDataCatalog`` containing the node's inputs and outputs. + catalog: A catalog containing the node's inputs and outputs. inputs: The dictionary of inputs dataset. The keys are dataset names and the values are the actual loaded input data, not the dataset instance. @@ -93,7 +93,7 @@ def after_node_run( # noqa: PLR0913 Args: node: The ``Node`` that ran. - catalog: A ``BaseDataCatalog`` containing the node's inputs and outputs. + catalog: A catalog containing the node's inputs and outputs. inputs: The dictionary of inputs dataset. The keys are dataset names and the values are the actual loaded input data, not the dataset instance. @@ -122,7 +122,7 @@ def on_node_error( # noqa: PLR0913 Args: error: The uncaught exception thrown during the node run. node: The ``Node`` to run. - catalog: A ``BaseDataCatalog`` containing the node's inputs and outputs. + catalog: A catalog containing the node's inputs and outputs. inputs: The dictionary of inputs dataset. The keys are dataset names and the values are the actual loaded input data, not the dataset instance. @@ -164,7 +164,7 @@ def before_pipeline_run( } pipeline: The ``Pipeline`` that will be run. - catalog: The ``BaseDataCatalog`` to be used during the run. + catalog: The catalog to be used during the run. """ pass @@ -202,7 +202,7 @@ def after_pipeline_run( run_result: The output of ``Pipeline`` run. pipeline: The ``Pipeline`` that was run. - catalog: The ``BaseDataCatalog`` used during the run. + catalog: The catalog used during the run. """ pass @@ -242,7 +242,7 @@ def on_pipeline_error( } pipeline: The ``Pipeline`` that will was run. - catalog: The ``BaseDataCatalog`` used during the run. + catalog: The catalog used during the run. """ pass diff --git a/kedro/framework/project/__init__.py b/kedro/framework/project/__init__.py index 094cfd4aef..195fa077f6 100644 --- a/kedro/framework/project/__init__.py +++ b/kedro/framework/project/__init__.py @@ -52,15 +52,6 @@ def validate( default_class = self.default(settings, self) for name in self.names: setting_value = getattr(settings, name) - # Allow using new KedroDataCatalog - # TODO: remove with the old catalog - if ( - f"{setting_value.__module__}.{setting_value.__qualname__}" - == "kedro.io.kedro_data_catalog.KedroDataCatalog" - and f"{default_class.__module__}.{default_class.__qualname__}" - == "kedro.io.data_catalog.DataCatalog" - ): - continue if not issubclass(setting_value, default_class): raise ValidationError( f"Invalid value '{setting_value.__module__}.{setting_value.__qualname__}' " diff --git a/kedro/io/__init__.py b/kedro/io/__init__.py index 17abb93e9e..9697e1bd35 100644 --- a/kedro/io/__init__.py +++ b/kedro/io/__init__.py @@ -16,14 +16,13 @@ Version, ) from .data_catalog import DataCatalog -from .kedro_data_catalog import BaseDataCatalog, KedroDataCatalog +from .kedro_data_catalog import KedroDataCatalog from .lambda_dataset import LambdaDataset from .memory_dataset import MemoryDataset from .shared_memory_dataset import SharedMemoryDataset __all__ = [ "AbstractDataset", - "BaseDataCatalog", "AbstractVersionedDataset", "CachedDataset", "CatalogProtocol", diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index cba1b8a28f..c63d557d66 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -11,9 +11,8 @@ import difflib import logging import re -from typing import Any, Union +from typing import Any -from kedro.io import DataCatalog from kedro.io.catalog_config_resolver import CatalogConfigResolver, Patterns from kedro.io.core import ( AbstractDataset, @@ -333,6 +332,3 @@ def exists(self, name: str) -> bool: except DatasetNotFoundError: return False return dataset.exists() - - -BaseDataCatalog = Union[DataCatalog, KedroDataCatalog] From 0ec1f23f17c4bca649042eaa6793a8eca684007d Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 13 Sep 2024 12:23:38 +0100 Subject: [PATCH 114/173] Removed reference to DataCatalog in docstrings Signed-off-by: Elena Khaustova --- kedro/framework/hooks/specs.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kedro/framework/hooks/specs.py b/kedro/framework/hooks/specs.py index 3fd4871aee..996134920e 100644 --- a/kedro/framework/hooks/specs.py +++ b/kedro/framework/hooks/specs.py @@ -63,7 +63,7 @@ def before_node_run( Args: node: The ``Node`` to run. - catalog: A ``DataCatalog`` containing the node's inputs and outputs. + catalog: A catalog containing the node's inputs and outputs. inputs: The dictionary of inputs dataset. The keys are dataset names and the values are the actual loaded input data, not the dataset instance. @@ -93,7 +93,7 @@ def after_node_run( # noqa: PLR0913 Args: node: The ``Node`` that ran. - catalog: A ``DataCatalog`` containing the node's inputs and outputs. + catalog: A catalog containing the node's inputs and outputs. inputs: The dictionary of inputs dataset. The keys are dataset names and the values are the actual loaded input data, not the dataset instance. @@ -122,7 +122,7 @@ def on_node_error( # noqa: PLR0913 Args: error: The uncaught exception thrown during the node run. node: The ``Node`` to run. - catalog: A ``DataCatalog`` containing the node's inputs and outputs. + catalog: A catalog containing the node's inputs and outputs. inputs: The dictionary of inputs dataset. The keys are dataset names and the values are the actual loaded input data, not the dataset instance. @@ -164,7 +164,7 @@ def before_pipeline_run( } pipeline: The ``Pipeline`` that will be run. - catalog: The ``DataCatalog`` to be used during the run. + catalog: The catalog to be used during the run. """ pass @@ -202,7 +202,7 @@ def after_pipeline_run( run_result: The output of ``Pipeline`` run. pipeline: The ``Pipeline`` that was run. - catalog: The ``DataCatalog`` used during the run. + catalog: The catalog used during the run. """ pass @@ -242,7 +242,7 @@ def on_pipeline_error( } pipeline: The ``Pipeline`` that will was run. - catalog: The ``DataCatalog`` used during the run. + catalog: The catalog used during the run. """ pass From 4ecd8fdb99b45cb2ea8bd2aa4a8aaab2e48b90a3 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 13 Sep 2024 12:26:42 +0100 Subject: [PATCH 115/173] Fixed docs Signed-off-by: Elena Khaustova --- docs/source/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/conf.py b/docs/source/conf.py index b6e23f2ee9..50b719f117 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -173,6 +173,7 @@ "Patterns", "CatalogConfigResolver", "CatalogProtocol", + "KedroDataCatalog", ), "py:data": ( "typing.Any", From 11b3426ed7c911b3ed25cc1590e32cd9d9ba254a Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 13 Sep 2024 13:14:46 +0100 Subject: [PATCH 116/173] Reordered methods Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index c63d557d66..940f8d34f8 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -252,6 +252,20 @@ def save(self, name: str, data: Any) -> None: dataset.save(data) + def load(self, name: str, version: str | None = None) -> Any: + """Loads a registered dataset.""" + load_version = Version(version, None) if version else None + dataset = self.get_dataset(name, version=load_version) + + self._logger.info( + "Loading data from %s (%s)...", + _format_rich(name, "dark_orange") if self._use_rich_markup else name, + type(dataset).__name__, + extra={"markup": True}, + ) + + return dataset.load() + def release(self, name: str) -> None: """Release any cached data associated with a dataset Args: @@ -278,20 +292,6 @@ def confirm(self, name: str) -> None: else: raise DatasetError(f"Dataset '{name}' does not have 'confirm' method") - def load(self, name: str, version: str | None = None) -> Any: - """Loads a registered dataset.""" - load_version = Version(version, None) if version else None - dataset = self.get_dataset(name, version=load_version) - - self._logger.info( - "Loading data from %s (%s)...", - _format_rich(name, "dark_orange") if self._use_rich_markup else name, - type(dataset).__name__, - extra={"markup": True}, - ) - - return dataset.load() - def add_raw_data(self, data: dict[str, Any], replace: bool = False) -> None: # This method was simplified to add memory datasets only, since # adding AbstractDataset can be done via add() method From 741b682e7b3fb8cc306e58b7cc0a268b70ab1c18 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 13 Sep 2024 13:18:19 +0100 Subject: [PATCH 117/173] Removed add_all from protocol Signed-off-by: Elena Khaustova --- kedro/io/core.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index 1ed35a871b..bb7c066309 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -929,10 +929,6 @@ def add(self, ds_name: str, dataset: Any, replace: bool = False) -> None: """Add a new dataset to the catalog.""" ... - def add_all(self, datasets: dict[str, Any], replace: bool = False) -> None: - """Add a new dataset to the catalog.""" - ... - def add_feed_dict(self, datasets: dict[str, Any], replace: bool = False) -> None: """Add datasets to the catalog using the data provided through the `feed_dict`.""" ... From 0020095d61503849424d8f5d66675c729627f400 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 13 Sep 2024 14:48:45 +0100 Subject: [PATCH 118/173] Changed the order of arguments Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 940f8d34f8..0349b44660 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -32,9 +32,9 @@ def __init__( self, datasets: dict[str, AbstractDataset] | None = None, raw_data: dict[str, Any] | None = None, + config_resolver: CatalogConfigResolver | None = None, load_versions: dict[str, str] | None = None, save_version: str | None = None, - config_resolver: CatalogConfigResolver | None = None, ) -> None: """``KedroDataCatalog`` stores instances of ``AbstractDataset`` implementations to provide ``load`` and ``save`` capabilities from @@ -47,6 +47,7 @@ def __init__( datasets: A dictionary of data set names and data set instances. raw_data: A dictionary with data to be added in memory as `MemoryDataset`` instances. Keys represent dataset names and the values are raw data. + config_resolver: An instance of CatalogConfigResolver to resolve dataset patterns and configurations. load_versions: A mapping between data set names and versions to load. Has no effect on data sets without enabled versioning. save_version: Version string to be used for ``save`` operations @@ -54,9 +55,6 @@ def __init__( case-insensitive string that conforms with operating system filename limitations, b) always return the latest version when sorted in lexicographical order.. - config_resolver: An instance of CatalogConfigResolver to resolve dataset patterns and configurations. - - """ self._config_resolver = config_resolver or CatalogConfigResolver() self._datasets = datasets or {} From 78feb513801aa23628d06e75637d1afe82bce985 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 13 Sep 2024 17:04:49 +0100 Subject: [PATCH 119/173] Updated docstrings Signed-off-by: Elena Khaustova --- kedro/framework/hooks/specs.py | 6 +++--- kedro/runner/parallel_runner.py | 2 +- kedro/runner/runner.py | 10 +++++----- kedro/runner/sequential_runner.py | 2 +- kedro/runner/thread_runner.py | 2 +- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/kedro/framework/hooks/specs.py b/kedro/framework/hooks/specs.py index 996134920e..de9922c353 100644 --- a/kedro/framework/hooks/specs.py +++ b/kedro/framework/hooks/specs.py @@ -63,7 +63,7 @@ def before_node_run( Args: node: The ``Node`` to run. - catalog: A catalog containing the node's inputs and outputs. + catalog: An implemented instance of ``CatalogProtocol`` containing the node's inputs and outputs. inputs: The dictionary of inputs dataset. The keys are dataset names and the values are the actual loaded input data, not the dataset instance. @@ -93,7 +93,7 @@ def after_node_run( # noqa: PLR0913 Args: node: The ``Node`` that ran. - catalog: A catalog containing the node's inputs and outputs. + catalog: An implemented instance of ``CatalogProtocol`` containing the node's inputs and outputs. inputs: The dictionary of inputs dataset. The keys are dataset names and the values are the actual loaded input data, not the dataset instance. @@ -122,7 +122,7 @@ def on_node_error( # noqa: PLR0913 Args: error: The uncaught exception thrown during the node run. node: The ``Node`` to run. - catalog: A catalog containing the node's inputs and outputs. + catalog: An implemented instance of ``CatalogProtocol`` containing the node's inputs and outputs. inputs: The dictionary of inputs dataset. The keys are dataset names and the values are the actual loaded input data, not the dataset instance. diff --git a/kedro/runner/parallel_runner.py b/kedro/runner/parallel_runner.py index 6c56a54b4e..64f32df1c3 100644 --- a/kedro/runner/parallel_runner.py +++ b/kedro/runner/parallel_runner.py @@ -73,7 +73,7 @@ def _run_node_synchronization( # noqa: PLR0913 Args: node: The ``Node`` to run. - catalog: A catalog containing the node's inputs and outputs. + catalog: An implemented instance of ``CatalogProtocol`` containing the node's inputs and outputs. is_async: If True, the node inputs and outputs are loaded and saved asynchronously with threads. Defaults to False. session_id: The session id of the pipeline run. diff --git a/kedro/runner/runner.py b/kedro/runner/runner.py index bb680aefe6..8e907923b0 100644 --- a/kedro/runner/runner.py +++ b/kedro/runner/runner.py @@ -68,7 +68,7 @@ def run( Args: pipeline: The ``Pipeline`` to run. - catalog: The catalog from which to fetch data. + catalog: An implemented instance of ``CatalogProtocol`` from which to fetch data. hook_manager: The ``PluginManager`` to activate hooks. session_id: The id of the session. @@ -132,7 +132,7 @@ def run_only_missing( Args: pipeline: The ``Pipeline`` to run. - catalog: The catalog from which to fetch data. + catalog: An implemented instance of ``CatalogProtocol`` from which to fetch data. hook_manager: The ``PluginManager`` to activate hooks. Raises: ValueError: Raised when ``Pipeline`` inputs cannot be @@ -173,7 +173,7 @@ def _run( Args: pipeline: The ``Pipeline`` to run. - catalog: The `catalog from which to fetch data. + catalog: An implemented instance of ``CatalogProtocol`` from which to fetch data. hook_manager: The ``PluginManager`` to activate hooks. session_id: The id of the session. @@ -194,7 +194,7 @@ def _suggest_resume_scenario( Args: pipeline: the ``Pipeline`` of the run. done_nodes: the ``Node``s that executed successfully. - catalog: the catalog of the run. + catalog: an implemented instance of ``CatalogProtocol`` of the run. """ remaining_nodes = set(pipeline.nodes) - set(done_nodes) @@ -388,7 +388,7 @@ def run_node( Args: node: The ``Node`` to run. - catalog: A catalog containing the node's inputs and outputs. + catalog: An implemented instance of ``CatalogProtocol`` containing the node's inputs and outputs. hook_manager: The ``PluginManager`` to activate hooks. is_async: If True, the node inputs and outputs are loaded and saved asynchronously with threads. Defaults to False. diff --git a/kedro/runner/sequential_runner.py b/kedro/runner/sequential_runner.py index 3f60414b2b..c888e737cf 100644 --- a/kedro/runner/sequential_runner.py +++ b/kedro/runner/sequential_runner.py @@ -56,7 +56,7 @@ def _run( Args: pipeline: The ``Pipeline`` to run. - catalog: The catalog from which to fetch data. + catalog: An implemented instance of ``CatalogProtocol`` from which to fetch data. hook_manager: The ``PluginManager`` to activate hooks. session_id: The id of the session. diff --git a/kedro/runner/thread_runner.py b/kedro/runner/thread_runner.py index d2c0c251dc..5ad13b9153 100644 --- a/kedro/runner/thread_runner.py +++ b/kedro/runner/thread_runner.py @@ -95,7 +95,7 @@ def _run( Args: pipeline: The ``Pipeline`` to run. - catalog: The catalog from which to fetch data. + catalog: An implemented instance of ``CatalogProtocol`` from which to fetch data. hook_manager: The ``PluginManager`` to activate hooks. session_id: The id of the session. From 6bf912c3345217e9391a5438d926c7af5dae0b06 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 13 Sep 2024 17:13:02 +0100 Subject: [PATCH 120/173] Updated docstrings Signed-off-by: Elena Khaustova --- kedro/framework/hooks/specs.py | 6 +++--- kedro/runner/parallel_runner.py | 2 +- kedro/runner/runner.py | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/kedro/framework/hooks/specs.py b/kedro/framework/hooks/specs.py index de9922c353..3b32eb294c 100644 --- a/kedro/framework/hooks/specs.py +++ b/kedro/framework/hooks/specs.py @@ -164,7 +164,7 @@ def before_pipeline_run( } pipeline: The ``Pipeline`` that will be run. - catalog: The catalog to be used during the run. + catalog: An implemented instance of ``CatalogProtocol`` to be used during the run. """ pass @@ -202,7 +202,7 @@ def after_pipeline_run( run_result: The output of ``Pipeline`` run. pipeline: The ``Pipeline`` that was run. - catalog: The catalog used during the run. + catalog: An implemented instance of ``CatalogProtocol`` used during the run. """ pass @@ -242,7 +242,7 @@ def on_pipeline_error( } pipeline: The ``Pipeline`` that will was run. - catalog: The catalog used during the run. + catalog: An implemented instance of ``CatalogProtocol`` used during the run. """ pass diff --git a/kedro/runner/parallel_runner.py b/kedro/runner/parallel_runner.py index 64f32df1c3..d09601ff7e 100644 --- a/kedro/runner/parallel_runner.py +++ b/kedro/runner/parallel_runner.py @@ -250,7 +250,7 @@ def _run( Args: pipeline: The ``Pipeline`` to run. - catalog: The `catalog from which to fetch data. + catalog: An implemented instance of ``CatalogProtocol`` from which to fetch data. hook_manager: The ``PluginManager`` to activate hooks. session_id: The id of the session. diff --git a/kedro/runner/runner.py b/kedro/runner/runner.py index 8e907923b0..f3a0889909 100644 --- a/kedro/runner/runner.py +++ b/kedro/runner/runner.py @@ -233,7 +233,7 @@ def _find_nodes_to_resume_from( Args: pipeline: the ``Pipeline`` to find starting nodes for. unfinished_nodes: collection of ``Node``s that have not finished yet - catalog: the catalog of the run. + catalog: an implemented instance of ``CatalogProtocol`` of the run. Returns: Set of node names to pass to pipeline.from_nodes() to continue @@ -261,7 +261,7 @@ def _find_all_nodes_for_resumed_pipeline( Args: pipeline: the ``Pipeline`` to analyze. unfinished_nodes: the iterable of ``Node``s which have not finished yet. - catalog: the catalog of the run. + catalog: an implemented instance of ``CatalogProtocol`` of the run. Returns: A set containing all input unfinished ``Node``s and all remaining @@ -314,7 +314,7 @@ def _enumerate_non_persistent_inputs(node: Node, catalog: CatalogProtocol) -> se Args: node: the ``Node`` to check the inputs of. - catalog: the catalog of the run. + catalog: an implemented instance of ``CatalogProtocol`` of the run. Returns: Set of names of non-persistent inputs of given ``Node``. From bcd2d3762769e55750a0772fd554b98d7f38f278 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 16 Sep 2024 11:30:26 +0100 Subject: [PATCH 121/173] Added __repr__ Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 0349b44660..c3a74922e3 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -83,6 +83,9 @@ def datasets(self, value: Any) -> None: def config_resolver(self) -> CatalogConfigResolver: return self._config_resolver + def __repr__(self) -> str: + return self._datasets.__repr__() + def __iter__(self) -> AbstractDataset: yield from self._datasets.values() From eb7e8f5ee1a93eace28523e27a84414f948f71b9 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 16 Sep 2024 11:55:41 +0100 Subject: [PATCH 122/173] Made __getitem__ return deepcopy Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index c3a74922e3..1f0dad290d 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -90,7 +90,7 @@ def __iter__(self) -> AbstractDataset: yield from self._datasets.values() def __getitem__(self, ds_name: str) -> AbstractDataset: - return self.get_dataset(ds_name) + return copy.deepcopy(self.get_dataset(ds_name)) def __contains__(self, dataset_name: str) -> bool: """Check if an item is in the catalog as a materialised dataset or pattern""" From 7348c1229e4ea468bd5e46cd988d5ce186faabcb Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 16 Sep 2024 14:52:53 +0100 Subject: [PATCH 123/173] Fixed bug in get_dataset() Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 1f0dad290d..a940522a5a 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -176,7 +176,7 @@ def get_dataset( ) -> AbstractDataset: ds_config = self._config_resolver.resolve_dataset_pattern(ds_name) - if ds_name not in self._datasets and ds_config is not None: + if ds_name not in self._datasets and ds_config: self._add_from_config(ds_name, ds_config) dataset = self._datasets.get(ds_name, None) From 5aee9e9c65180578980df3ddf6a783e9e8fa856e Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 16 Sep 2024 16:06:02 +0100 Subject: [PATCH 124/173] Fixed __eq__ Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index a940522a5a..24f532f86e 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -101,7 +101,7 @@ def __contains__(self, dataset_name: str) -> bool: def __eq__(self, other) -> bool: # type: ignore[no-untyped-def] return (self._datasets, self._config_resolver.list_patterns()) == ( - other.datasets, + other._datasets, other.config_resolver.list_patterns(), ) From c9c7c9a03c9076f1aa32e06cda5d5b67803056d0 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 16 Sep 2024 16:30:55 +0100 Subject: [PATCH 125/173] Fixed docstrings Signed-off-by: Elena Khaustova --- kedro/io/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index bb7c066309..0b722444d4 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -901,7 +901,7 @@ def config_resolver(self) -> CatalogConfigResolver: @classmethod def from_config(cls, catalog: dict[str, dict[str, Any]] | None) -> _C: - """Create a ``KedroDataCatalog`` instance from configuration.""" + """Create a catalog instance from configuration.""" ... def _get_dataset( From 4b8d90ce1ac122dae3fba1302464e8651f5a91d0 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 17 Sep 2024 11:11:41 +0100 Subject: [PATCH 126/173] Added __setitem__ Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 24f532f86e..9949abdd95 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -92,6 +92,14 @@ def __iter__(self) -> AbstractDataset: def __getitem__(self, ds_name: str) -> AbstractDataset: return copy.deepcopy(self.get_dataset(ds_name)) + def __setitem__(self, key: str, value: Any) -> None: + msg = "Operation not allowed!" + if key in self: + msg = f"{msg} Please change datasets through configuration." + else: + msg = f"{msg} Please use KedroDataCatalog.add() instead." + raise AttributeError(msg) + def __contains__(self, dataset_name: str) -> bool: """Check if an item is in the catalog as a materialised dataset or pattern""" return ( From 8f870a8fde0430484aeae334203a780349b76dee Mon Sep 17 00:00:00 2001 From: ElenaKhaustova <157851531+ElenaKhaustova@users.noreply.github.com> Date: Tue, 17 Sep 2024 14:12:50 +0100 Subject: [PATCH 127/173] Unit tests for `KedroDataCatalog` (#4171) * Added KedroDataCatlog tests template Signed-off-by: Elena Khaustova * Added test save/load unregistered dataset Signed-off-by: Elena Khaustova * Added test_feed_dict Signed-off-by: Elena Khaustova * Added exists tests Signed-off-by: Elena Khaustova * Added tests for list() Signed-off-by: Elena Khaustova * Added test_eq Signed-off-by: Elena Khaustova * Added test init/add datasets Signed-off-by: Elena Khaustova * Updated test_adding_datasets_not_allowed Signed-off-by: Elena Khaustova * Added shallow copy tests Signed-off-by: Elena Khaustova * Added TestKedroDataCatalogFromConfig Signed-off-by: Elena Khaustova * Added missing tests Signed-off-by: Elena Khaustova --------- Signed-off-by: Elena Khaustova --- tests/io/conftest.py | 66 +++ tests/io/test_data_catalog.py | 70 --- tests/io/test_kedro_data_catalog.py | 689 ++++++++++++++++++++++++++++ 3 files changed, 755 insertions(+), 70 deletions(-) create mode 100644 tests/io/test_kedro_data_catalog.py diff --git a/tests/io/conftest.py b/tests/io/conftest.py index 2cc38aa1ea..f6d6b1191e 100644 --- a/tests/io/conftest.py +++ b/tests/io/conftest.py @@ -1,6 +1,7 @@ import numpy as np import pandas as pd import pytest +from kedro_datasets.pandas import CSVDataset @pytest.fixture @@ -21,3 +22,68 @@ def input_data(request): @pytest.fixture def new_data(): return pd.DataFrame({"col1": ["a", "b"], "col2": ["c", "d"], "col3": ["e", "f"]}) + + +@pytest.fixture +def filepath(tmp_path): + return (tmp_path / "some" / "dir" / "test.csv").as_posix() + + +@pytest.fixture +def dataset(filepath): + return CSVDataset(filepath=filepath, save_args={"index": False}) + + +@pytest.fixture +def sane_config(filepath): + return { + "catalog": { + "boats": {"type": "pandas.CSVDataset", "filepath": filepath}, + "cars": { + "type": "pandas.CSVDataset", + "filepath": "s3://test_bucket/test_file.csv", + "credentials": "s3_credentials", + }, + }, + "credentials": { + "s3_credentials": {"key": "FAKE_ACCESS_KEY", "secret": "FAKE_SECRET_KEY"} + }, + } + + +@pytest.fixture +def sane_config_with_nested_creds(sane_config): + sane_config["catalog"]["cars"]["credentials"] = { + "client_kwargs": {"credentials": "other_credentials"}, + "key": "secret", + } + sane_config["credentials"]["other_credentials"] = { + "client_kwargs": { + "aws_access_key_id": "OTHER_FAKE_ACCESS_KEY", + "aws_secret_access_key": "OTHER_FAKE_SECRET_KEY", + } + } + return sane_config + + +@pytest.fixture +def bad_config(filepath): + return { + "bad": {"type": "tests.io.test_data_catalog.BadDataset", "filepath": filepath} + } + + +@pytest.fixture +def sane_config_with_tracking_ds(tmp_path): + boat_path = (tmp_path / "some" / "dir" / "test.csv").as_posix() + plane_path = (tmp_path / "some" / "dir" / "metrics.json").as_posix() + return { + "catalog": { + "boats": { + "type": "pandas.CSVDataset", + "filepath": boat_path, + "versioned": True, + }, + "planes": {"type": "tracking.MetricsDataset", "filepath": plane_path}, + }, + } diff --git a/tests/io/test_data_catalog.py b/tests/io/test_data_catalog.py index db777cc634..ee5c760833 100644 --- a/tests/io/test_data_catalog.py +++ b/tests/io/test_data_catalog.py @@ -29,64 +29,6 @@ ) -@pytest.fixture -def filepath(tmp_path): - return (tmp_path / "some" / "dir" / "test.csv").as_posix() - - -@pytest.fixture -def dummy_dataframe(): - return pd.DataFrame({"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}) - - -@pytest.fixture -def sane_config(filepath): - return { - "catalog": { - "boats": {"type": "pandas.CSVDataset", "filepath": filepath}, - "cars": { - "type": "pandas.CSVDataset", - "filepath": "s3://test_bucket/test_file.csv", - "credentials": "s3_credentials", - }, - }, - "credentials": { - "s3_credentials": {"key": "FAKE_ACCESS_KEY", "secret": "FAKE_SECRET_KEY"} - }, - } - - -@pytest.fixture -def sane_config_with_nested_creds(sane_config): - sane_config["catalog"]["cars"]["credentials"] = { - "client_kwargs": {"credentials": "other_credentials"}, - "key": "secret", - } - sane_config["credentials"]["other_credentials"] = { - "client_kwargs": { - "aws_access_key_id": "OTHER_FAKE_ACCESS_KEY", - "aws_secret_access_key": "OTHER_FAKE_SECRET_KEY", - } - } - return sane_config - - -@pytest.fixture -def sane_config_with_tracking_ds(tmp_path): - boat_path = (tmp_path / "some" / "dir" / "test.csv").as_posix() - plane_path = (tmp_path / "some" / "dir" / "metrics.json").as_posix() - return { - "catalog": { - "boats": { - "type": "pandas.CSVDataset", - "filepath": boat_path, - "versioned": True, - }, - "planes": {"type": "tracking.MetricsDataset", "filepath": plane_path}, - }, - } - - @pytest.fixture def config_with_dataset_factories(): return { @@ -180,11 +122,6 @@ def config_with_dataset_factories_only_patterns_no_default( return config_with_dataset_factories_only_patterns -@pytest.fixture -def dataset(filepath): - return CSVDataset(filepath=filepath, save_args={"index": False}) - - @pytest.fixture def multi_catalog(): csv = CSVDataset(filepath="abc.csv") @@ -220,13 +157,6 @@ def _describe(self): return {} -@pytest.fixture -def bad_config(filepath): - return { - "bad": {"type": "tests.io.test_data_catalog.BadDataset", "filepath": filepath} - } - - @pytest.fixture def data_catalog(dataset): return DataCatalog(datasets={"test": dataset}) diff --git a/tests/io/test_kedro_data_catalog.py b/tests/io/test_kedro_data_catalog.py new file mode 100644 index 0000000000..de05af7cec --- /dev/null +++ b/tests/io/test_kedro_data_catalog.py @@ -0,0 +1,689 @@ +import logging +import re +import sys +from copy import deepcopy +from datetime import datetime, timezone +from pathlib import Path + +import pandas as pd +import pytest +from kedro_datasets.pandas import CSVDataset, ParquetDataset +from pandas.testing import assert_frame_equal + +from kedro.io import ( + DatasetAlreadyExistsError, + DatasetError, + DatasetNotFoundError, + KedroDataCatalog, + LambdaDataset, + MemoryDataset, +) +from kedro.io.core import ( + _DEFAULT_PACKAGES, + VERSION_FORMAT, + generate_timestamp, + parse_dataset_definition, +) + + +@pytest.fixture +def data_catalog(dataset): + return KedroDataCatalog(datasets={"test": dataset}) + + +@pytest.fixture +def memory_catalog(): + ds1 = MemoryDataset({"data": 42}) + ds2 = MemoryDataset([1, 2, 3, 4, 5]) + return KedroDataCatalog({"ds1": ds1, "ds2": ds2}) + + +@pytest.fixture +def conflicting_feed_dict(): + return {"ds1": 0, "ds3": 1} + + +@pytest.fixture +def multi_catalog(): + csv = CSVDataset(filepath="abc.csv") + parq = ParquetDataset(filepath="xyz.parq") + return KedroDataCatalog({"abc": csv, "xyz": parq}) + + +@pytest.fixture +def data_catalog_from_config(sane_config): + return KedroDataCatalog.from_config(**sane_config) + + +class TestKedroDataCatalog: + def test_save_and_load(self, data_catalog, dummy_dataframe): + """Test saving and reloading the data set""" + data_catalog.save("test", dummy_dataframe) + reloaded_df = data_catalog.load("test") + + assert_frame_equal(reloaded_df, dummy_dataframe) + + def test_add_save_and_load(self, dataset, dummy_dataframe): + """Test adding and then saving and reloading the data set""" + catalog = KedroDataCatalog(datasets={}) + catalog.add("test", dataset) + catalog.save("test", dummy_dataframe) + reloaded_df = catalog.load("test") + + assert_frame_equal(reloaded_df, dummy_dataframe) + + def test_load_error(self, data_catalog): + """Check the error when attempting to load a data set + from nonexistent source""" + pattern = r"Failed while loading data from data set CSVDataset" + with pytest.raises(DatasetError, match=pattern): + data_catalog.load("test") + + def test_add_dataset_twice(self, data_catalog, dataset): + """Check the error when attempting to add the data set twice""" + pattern = r"Dataset 'test' has already been registered" + with pytest.raises(DatasetAlreadyExistsError, match=pattern): + data_catalog.add("test", dataset) + + def test_load_from_unregistered(self): + """Check the error when attempting to load unregistered data set""" + catalog = KedroDataCatalog(datasets={}) + pattern = r"Dataset 'test' not found in the catalog" + with pytest.raises(DatasetNotFoundError, match=pattern): + catalog.load("test") + + def test_save_to_unregistered(self, dummy_dataframe): + """Check the error when attempting to save to unregistered data set""" + catalog = KedroDataCatalog(datasets={}) + pattern = r"Dataset 'test' not found in the catalog" + with pytest.raises(DatasetNotFoundError, match=pattern): + catalog.save("test", dummy_dataframe) + + def test_feed_dict(self, memory_catalog, conflicting_feed_dict): + """Test feed dict overriding some of the data sets""" + assert "data" in memory_catalog.load("ds1") + memory_catalog.add_feed_dict(conflicting_feed_dict, replace=True) + assert memory_catalog.load("ds1") == 0 + assert isinstance(memory_catalog.load("ds2"), list) + assert memory_catalog.load("ds3") == 1 + + def test_exists(self, data_catalog, dummy_dataframe): + """Test `exists` method invocation""" + assert not data_catalog.exists("test") + data_catalog.save("test", dummy_dataframe) + assert data_catalog.exists("test") + + def test_exists_not_implemented(self, caplog): + """Test calling `exists` on the data set, which didn't implement it""" + catalog = KedroDataCatalog(datasets={"test": LambdaDataset(None, None)}) + result = catalog.exists("test") + + log_record = caplog.records[0] + assert log_record.levelname == "WARNING" + assert ( + "'exists()' not implemented for 'LambdaDataset'. " + "Assuming output does not exist." in log_record.message + ) + assert result is False + + def test_exists_invalid(self, data_catalog): + """Check the error when calling `exists` on invalid data set""" + assert not data_catalog.exists("wrong_key") + + def test_release_unregistered(self, data_catalog): + """Check the error when calling `release` on unregistered data set""" + pattern = r"Dataset \'wrong_key\' not found in the catalog" + with pytest.raises(DatasetNotFoundError, match=pattern) as e: + data_catalog.release("wrong_key") + assert "did you mean" not in str(e.value) + + def test_release_unregistered_typo(self, data_catalog): + """Check the error when calling `release` on mistyped data set""" + pattern = ( + "Dataset 'text' not found in the catalog" + " - did you mean one of these instead: test" + ) + with pytest.raises(DatasetNotFoundError, match=re.escape(pattern)): + data_catalog.release("text") + + def test_multi_catalog_list(self, multi_catalog): + """Test data catalog which contains multiple data sets""" + entries = multi_catalog.list() + assert "abc" in entries + assert "xyz" in entries + + @pytest.mark.parametrize( + "pattern,expected", + [ + ("^a", ["abc"]), + ("a|x", ["abc", "xyz"]), + ("^(?!(a|x))", []), + ("def", []), + ("", []), + ], + ) + def test_multi_catalog_list_regex(self, multi_catalog, pattern, expected): + """Test that regex patterns filter data sets accordingly""" + assert multi_catalog.list(regex_search=pattern) == expected + + def test_multi_catalog_list_bad_regex(self, multi_catalog): + """Test that bad regex is caught accordingly""" + escaped_regex = r"\(\(" + pattern = f"Invalid regular expression provided: '{escaped_regex}'" + with pytest.raises(SyntaxError, match=pattern): + multi_catalog.list("((") + + def test_eq(self, multi_catalog, data_catalog): + assert multi_catalog == multi_catalog.shallow_copy() + assert multi_catalog != data_catalog + + def test_datasets_on_init(self, data_catalog_from_config): + """Check datasets are loaded correctly on construction""" + assert isinstance(data_catalog_from_config["boats"], CSVDataset) + assert isinstance(data_catalog_from_config["cars"], CSVDataset) + + def test_datasets_on_add(self, data_catalog_from_config): + """Check datasets are updated correctly after adding""" + data_catalog_from_config.add("new_dataset", CSVDataset(filepath="some_path")) + assert isinstance(data_catalog_from_config["new_dataset"], CSVDataset) + assert isinstance(data_catalog_from_config["boats"], CSVDataset) + + def test_adding_datasets_not_allowed(self, data_catalog_from_config): + """Check error if user tries to update the datasets attribute""" + pattern = r"Operation not allowed! Please use KedroDataCatalog.add\(\) instead." + with pytest.raises(AttributeError, match=pattern): + data_catalog_from_config["new_dataset"] = None + + def test_mutating_datasets_not_allowed(self, data_catalog_from_config): + """Check error if user tries to update the datasets attribute""" + pattern = "Operation not allowed! Please change datasets through configuration." + with pytest.raises(AttributeError, match=pattern): + data_catalog_from_config["boats"] = None + + def test_confirm(self, mocker, caplog): + """Confirm the dataset""" + with caplog.at_level(logging.INFO): + mock_ds = mocker.Mock() + data_catalog = KedroDataCatalog(datasets={"mocked": mock_ds}) + data_catalog.confirm("mocked") + mock_ds.confirm.assert_called_once_with() + assert caplog.record_tuples == [ + ( + "kedro.io.kedro_data_catalog", + logging.INFO, + "Confirming dataset 'mocked'", + ) + ] + + @pytest.mark.parametrize( + "dataset_name,error_pattern", + [ + ("missing", "Dataset 'missing' not found in the catalog"), + ("test", "Dataset 'test' does not have 'confirm' method"), + ], + ) + def test_bad_confirm(self, data_catalog, dataset_name, error_pattern): + """Test confirming a non-existent dataset or one that + does not have `confirm` method""" + with pytest.raises(DatasetError, match=re.escape(error_pattern)): + data_catalog.confirm(dataset_name) + + def test_shallow_copy_returns_correct_class_type( + self, + ): + class MyDataCatalog(KedroDataCatalog): + pass + + data_catalog = MyDataCatalog() + copy = data_catalog.shallow_copy() + assert isinstance(copy, MyDataCatalog) + + @pytest.mark.parametrize( + "runtime_patterns,sorted_keys_expected", + [ + ( + { + "{default}": {"type": "MemoryDataset"}, + "{another}#csv": { + "type": "pandas.CSVDataset", + "filepath": "data/{another}.csv", + }, + }, + ["{another}#csv", "{default}"], + ) + ], + ) + def test_shallow_copy_adds_patterns( + self, data_catalog, runtime_patterns, sorted_keys_expected + ): + assert not data_catalog.config_resolver.list_patterns() + data_catalog = data_catalog.shallow_copy(runtime_patterns) + assert data_catalog.config_resolver.list_patterns() == sorted_keys_expected + + def test_key_completions(self, data_catalog_from_config): + """Test catalog.datasets key completions""" + assert isinstance(data_catalog_from_config.datasets["boats"], CSVDataset) + assert isinstance(data_catalog_from_config.datasets["cars"], CSVDataset) + data_catalog_from_config.add_feed_dict( + { + "params:model_options": [1, 2, 4], + "params:model_options.random_state": [0, 42, 67], + } + ) + assert isinstance( + data_catalog_from_config.datasets["params:model_options"], MemoryDataset + ) + assert set(data_catalog_from_config._ipython_key_completions_()) == { + "boats", + "cars", + "params:model_options", + "params:model_options.random_state", + } + + def test_init_with_raw_data(self, dummy_dataframe, dataset): + """Test catalog initialisation with raw data""" + catalog = KedroDataCatalog( + datasets={"ds": dataset}, raw_data={"df": dummy_dataframe} + ) + assert "ds" in catalog + assert "df" in catalog + assert isinstance(catalog["ds"], CSVDataset) + assert isinstance(catalog["df"], MemoryDataset) + + def test_set_datasets_not_allowed(self, data_catalog_from_config): + """Check error if user tries to modify datasets attribute""" + pattern = "Operation not allowed! Please change datasets through configuration." + with pytest.raises(AttributeError, match=pattern): + data_catalog_from_config.datasets = None + + def test_repr(self, data_catalog): + assert data_catalog.__repr__() == str(data_catalog) + + def test_iter(self, data_catalog): + assert list(data_catalog._datasets.values()) == [ds for ds in data_catalog] + + def test_missing_keys_from_load_versions(self, sane_config): + """Test load versions include keys missing in the catalog""" + pattern = "'load_versions' keys [version] are not found in the catalog." + with pytest.raises(DatasetNotFoundError, match=re.escape(pattern)): + KedroDataCatalog.from_config( + **sane_config, load_versions={"version": "test_version"} + ) + + def test_get_dataset_matching_pattern(self, data_catalog): + """Test get_dataset() when dataset is not in the catalog but pattern matches""" + match_pattern_ds = "match_pattern_ds" + assert match_pattern_ds not in data_catalog + data_catalog.config_resolver.add_runtime_patterns( + {"{default}": {"type": "MemoryDataset"}} + ) + ds = data_catalog.get_dataset(match_pattern_ds) + assert isinstance(ds, MemoryDataset) + + def test_release(self, data_catalog): + """Test release is called without errors""" + data_catalog.release("test") + + class TestKedroDataCatalogFromConfig: + def test_from_sane_config(self, data_catalog_from_config, dummy_dataframe): + """Test populating the data catalog from config""" + data_catalog_from_config.save("boats", dummy_dataframe) + reloaded_df = data_catalog_from_config.load("boats") + assert_frame_equal(reloaded_df, dummy_dataframe) + + def test_config_missing_type(self, sane_config): + """Check the error if type attribute is missing for some data set(s) + in the config""" + del sane_config["catalog"]["boats"]["type"] + pattern = ( + "An exception occurred when parsing config for dataset 'boats':\n" + "'type' is missing from dataset catalog configuration" + ) + with pytest.raises(DatasetError, match=re.escape(pattern)): + KedroDataCatalog.from_config(**sane_config) + + def test_config_invalid_module(self, sane_config): + """Check the error if the type points to nonexistent module""" + sane_config["catalog"]["boats"]["type"] = ( + "kedro.invalid_module_name.io.CSVDataset" + ) + + error_msg = "Class 'kedro.invalid_module_name.io.CSVDataset' not found" + with pytest.raises(DatasetError, match=re.escape(error_msg)): + KedroDataCatalog.from_config(**sane_config) + + def test_config_relative_import(self, sane_config): + """Check the error if the type points to a relative import""" + sane_config["catalog"]["boats"]["type"] = ".CSVDatasetInvalid" + + pattern = "'type' class path does not support relative paths" + with pytest.raises(DatasetError, match=re.escape(pattern)): + KedroDataCatalog.from_config(**sane_config) + + def test_config_import_kedro_datasets(self, sane_config, mocker): + """Test kedro_datasets default path to the dataset class""" + # Spy _load_obj because kedro_datasets is not installed and we can't import it. + + import kedro.io.core + + spy = mocker.spy(kedro.io.core, "_load_obj") + parse_dataset_definition(sane_config["catalog"]["boats"]) + for prefix, call_args in zip(_DEFAULT_PACKAGES, spy.call_args_list): + # In Python 3.7 call_args.args is not available thus we access the call + # arguments with less meaningful index. + # The 1st index returns a tuple, the 2nd index return the name of module. + assert call_args[0][0] == f"{prefix}pandas.CSVDataset" + + def test_config_import_extras(self, sane_config): + """Test kedro_datasets default path to the dataset class""" + sane_config["catalog"]["boats"]["type"] = "pandas.CSVDataset" + assert KedroDataCatalog.from_config(**sane_config) + + def test_config_missing_class(self, sane_config): + """Check the error if the type points to nonexistent class""" + sane_config["catalog"]["boats"]["type"] = "kedro.io.CSVDatasetInvalid" + + pattern = ( + "An exception occurred when parsing config for dataset 'boats':\n" + "Class 'kedro.io.CSVDatasetInvalid' not found, is this a typo?" + ) + with pytest.raises(DatasetError, match=re.escape(pattern)): + KedroDataCatalog.from_config(**sane_config) + + @pytest.mark.skipif( + sys.version_info < (3, 9), + reason="for python 3.8 kedro-datasets version 1.8 is used which has the old spelling", + ) + def test_config_incorrect_spelling(self, sane_config): + """Check hint if the type uses the old DataSet spelling""" + sane_config["catalog"]["boats"]["type"] = "pandas.CSVDataSet" + + pattern = ( + "An exception occurred when parsing config for dataset 'boats':\n" + "Class 'pandas.CSVDataSet' not found, is this a typo?" + "\nHint: If you are trying to use a dataset from `kedro-datasets`>=2.0.0," + " make sure that the dataset name uses the `Dataset` spelling instead of `DataSet`." + ) + with pytest.raises(DatasetError, match=re.escape(pattern)): + KedroDataCatalog.from_config(**sane_config) + + def test_config_invalid_dataset(self, sane_config): + """Check the error if the type points to invalid class""" + sane_config["catalog"]["boats"]["type"] = "KedroDataCatalog" + pattern = ( + "An exception occurred when parsing config for dataset 'boats':\n" + "Dataset type 'kedro.io.kedro_data_catalog.KedroDataCatalog' is invalid: " + "all data set types must extend 'AbstractDataset'" + ) + with pytest.raises(DatasetError, match=re.escape(pattern)): + KedroDataCatalog.from_config(**sane_config) + + def test_config_invalid_arguments(self, sane_config): + """Check the error if the data set config contains invalid arguments""" + sane_config["catalog"]["boats"]["save_and_load_args"] = False + pattern = ( + r"Dataset 'boats' must only contain arguments valid for " + r"the constructor of '.*CSVDataset'" + ) + with pytest.raises(DatasetError, match=pattern): + KedroDataCatalog.from_config(**sane_config) + + def test_config_invalid_dataset_config(self, sane_config): + sane_config["catalog"]["invalid_entry"] = "some string" + pattern = ( + "Catalog entry 'invalid_entry' is not a valid dataset configuration. " + "\nHint: If this catalog entry is intended for variable interpolation, " + "make sure that the key is preceded by an underscore." + ) + with pytest.raises(DatasetError, match=pattern): + KedroDataCatalog.from_config(**sane_config) + + def test_empty_config(self): + """Test empty config""" + assert KedroDataCatalog.from_config(None) + + def test_missing_credentials(self, sane_config): + """Check the error if credentials can't be located""" + sane_config["catalog"]["cars"]["credentials"] = "missing" + with pytest.raises( + KeyError, match=r"Unable to find credentials \'missing\'" + ): + KedroDataCatalog.from_config(**sane_config) + + def test_link_credentials(self, sane_config, mocker): + """Test credentials being linked to the relevant data set""" + mock_client = mocker.patch("kedro_datasets.pandas.csv_dataset.fsspec") + config = deepcopy(sane_config) + del config["catalog"]["boats"] + + KedroDataCatalog.from_config(**config) + + expected_client_kwargs = sane_config["credentials"]["s3_credentials"] + mock_client.filesystem.assert_called_with("s3", **expected_client_kwargs) + + def test_nested_credentials(self, sane_config_with_nested_creds, mocker): + mock_client = mocker.patch("kedro_datasets.pandas.csv_dataset.fsspec") + config = deepcopy(sane_config_with_nested_creds) + del config["catalog"]["boats"] + KedroDataCatalog.from_config(**config) + + expected_client_kwargs = { + "client_kwargs": { + "credentials": { + "client_kwargs": { + "aws_access_key_id": "OTHER_FAKE_ACCESS_KEY", + "aws_secret_access_key": "OTHER_FAKE_SECRET_KEY", + } + } + }, + "key": "secret", + } + mock_client.filesystem.assert_called_once_with( + "s3", **expected_client_kwargs + ) + + def test_missing_nested_credentials(self, sane_config_with_nested_creds): + del sane_config_with_nested_creds["credentials"]["other_credentials"] + pattern = "Unable to find credentials 'other_credentials'" + with pytest.raises(KeyError, match=pattern): + KedroDataCatalog.from_config(**sane_config_with_nested_creds) + + def test_missing_dependency(self, sane_config, mocker): + """Test that dependency is missing.""" + pattern = "dependency issue" + + def dummy_load(obj_path, *args, **kwargs): + if obj_path == "kedro_datasets.pandas.CSVDataset": + raise AttributeError(pattern) + if obj_path == "kedro_datasets.pandas.__all__": + return ["CSVDataset"] + + mocker.patch("kedro.io.core.load_obj", side_effect=dummy_load) + with pytest.raises(DatasetError, match=pattern): + KedroDataCatalog.from_config(**sane_config) + + def test_idempotent_catalog(self, sane_config): + """Test that data catalog instantiations are idempotent""" + _ = KedroDataCatalog.from_config(**sane_config) + catalog = KedroDataCatalog.from_config(**sane_config) + assert catalog + + def test_error_dataset_init(self, bad_config): + """Check the error when trying to instantiate erroneous data set""" + pattern = r"Failed to instantiate dataset \'bad\' of type '.*BadDataset'" + with pytest.raises(DatasetError, match=pattern): + KedroDataCatalog.from_config(bad_config, None) + + def test_validate_dataset_config(self): + """Test _validate_dataset_config raises error when wrong dataset config type is passed""" + pattern = ( + "Catalog entry 'bad' is not a valid dataset configuration. \n" + "Hint: If this catalog entry is intended for variable interpolation, make sure that the key is preceded by an underscore." + ) + with pytest.raises(DatasetError, match=pattern): + KedroDataCatalog._validate_dataset_config( + ds_name="bad", ds_config="not_dict" + ) + + def test_confirm(self, tmp_path, caplog, mocker): + """Confirm the dataset""" + with caplog.at_level(logging.INFO): + mock_confirm = mocker.patch( + "kedro_datasets.partitions.incremental_dataset.IncrementalDataset.confirm" + ) + catalog = { + "ds_to_confirm": { + "type": "kedro_datasets.partitions.incremental_dataset.IncrementalDataset", + "dataset": "pandas.CSVDataset", + "path": str(tmp_path), + } + } + data_catalog = KedroDataCatalog.from_config(catalog=catalog) + data_catalog.confirm("ds_to_confirm") + assert caplog.record_tuples == [ + ( + "kedro.io.kedro_data_catalog", + logging.INFO, + "Confirming dataset 'ds_to_confirm'", + ) + ] + mock_confirm.assert_called_once_with() + + @pytest.mark.parametrize( + "dataset_name,pattern", + [ + ("missing", "Dataset 'missing' not found in the catalog"), + ("boats", "Dataset 'boats' does not have 'confirm' method"), + ], + ) + def test_bad_confirm(self, sane_config, dataset_name, pattern): + """Test confirming non existent dataset or the one that + does not have `confirm` method""" + data_catalog = KedroDataCatalog.from_config(**sane_config) + with pytest.raises(DatasetError, match=re.escape(pattern)): + data_catalog.confirm(dataset_name) + + class TestDataCatalogVersioned: + def test_from_sane_config_versioned(self, sane_config, dummy_dataframe): + """Test load and save of versioned data sets from config""" + sane_config["catalog"]["boats"]["versioned"] = True + + # Decompose `generate_timestamp` to keep `current_ts` reference. + current_ts = datetime.now(tz=timezone.utc) + fmt = ( + "{d.year:04d}-{d.month:02d}-{d.day:02d}T{d.hour:02d}" + ".{d.minute:02d}.{d.second:02d}.{ms:03d}Z" + ) + version = fmt.format(d=current_ts, ms=current_ts.microsecond // 1000) + + catalog = KedroDataCatalog.from_config( + **sane_config, + load_versions={"boats": version}, + save_version=version, + ) + + catalog.save("boats", dummy_dataframe) + path = Path(sane_config["catalog"]["boats"]["filepath"]) + path = path / version / path.name + assert path.is_file() + + reloaded_df = catalog.load("boats") + assert_frame_equal(reloaded_df, dummy_dataframe) + + reloaded_df_version = catalog.load("boats", version=version) + assert_frame_equal(reloaded_df_version, dummy_dataframe) + + # Verify that `VERSION_FORMAT` can help regenerate `current_ts`. + actual_timestamp = datetime.strptime( + catalog["boats"].resolve_load_version(), + VERSION_FORMAT, + ) + expected_timestamp = current_ts.replace( + microsecond=current_ts.microsecond // 1000 * 1000, tzinfo=None + ) + assert actual_timestamp == expected_timestamp + + @pytest.mark.parametrize("versioned", [True, False]) + def test_from_sane_config_versioned_warn(self, caplog, sane_config, versioned): + """Check the warning if `version` attribute was added + to the data set config""" + sane_config["catalog"]["boats"]["versioned"] = versioned + sane_config["catalog"]["boats"]["version"] = True + KedroDataCatalog.from_config(**sane_config) + log_record = caplog.records[0] + expected_log_message = ( + "'version' attribute removed from data set configuration since it " + "is a reserved word and cannot be directly specified" + ) + assert log_record.levelname == "WARNING" + assert expected_log_message in log_record.message + + def test_from_sane_config_load_versions_warn(self, sane_config): + sane_config["catalog"]["boats"]["versioned"] = True + version = generate_timestamp() + load_version = {"non-boart": version} + pattern = ( + r"\'load_versions\' keys \[non-boart\] are not found in the catalog\." + ) + with pytest.raises(DatasetNotFoundError, match=pattern): + KedroDataCatalog.from_config(**sane_config, load_versions=load_version) + + def test_compare_tracking_and_other_dataset_versioned( + self, sane_config_with_tracking_ds, dummy_dataframe + ): + """Test saving of tracking data sets from config results in the same + save version as other versioned datasets.""" + + catalog = KedroDataCatalog.from_config(**sane_config_with_tracking_ds) + + catalog.save("boats", dummy_dataframe) + dummy_data = {"col1": 1, "col2": 2, "col3": 3} + catalog.save("planes", dummy_data) + + # Verify that saved version on tracking dataset is the same as on the CSV dataset + csv_timestamp = datetime.strptime( + catalog["boats"].resolve_save_version(), + VERSION_FORMAT, + ) + tracking_timestamp = datetime.strptime( + catalog["planes"].resolve_save_version(), + VERSION_FORMAT, + ) + + assert tracking_timestamp == csv_timestamp + + def test_load_version(self, sane_config, dummy_dataframe, mocker): + """Test load versioned data sets from config""" + new_dataframe = pd.DataFrame( + {"col1": [0, 0], "col2": [0, 0], "col3": [0, 0]} + ) + sane_config["catalog"]["boats"]["versioned"] = True + mocker.patch( + "kedro.io.kedro_data_catalog.generate_timestamp", + side_effect=["first", "second"], + ) + + # save first version of the dataset + catalog = KedroDataCatalog.from_config(**sane_config) + catalog.save("boats", dummy_dataframe) + + # save second version of the dataset + catalog = KedroDataCatalog.from_config(**sane_config) + catalog.save("boats", new_dataframe) + + assert_frame_equal(catalog.load("boats", version="first"), dummy_dataframe) + assert_frame_equal(catalog.load("boats", version="second"), new_dataframe) + assert_frame_equal(catalog.load("boats"), new_dataframe) + + def test_load_version_on_unversioned_dataset( + self, sane_config, dummy_dataframe, mocker + ): + mocker.patch( + "kedro.io.kedro_data_catalog.generate_timestamp", return_value="first" + ) + + catalog = KedroDataCatalog.from_config(**sane_config) + catalog.save("boats", dummy_dataframe) + + with pytest.raises(DatasetError): + catalog.load("boats", version="first") From ae7a2712e9b2011c8c55d8be3975ed37fc32d63f Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 17 Sep 2024 16:43:19 +0100 Subject: [PATCH 128/173] Updated RELEASE.md Signed-off-by: Elena Khaustova --- RELEASE.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/RELEASE.md b/RELEASE.md index f2f24f9a77..4375ee8672 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,6 +1,12 @@ # Upcoming Release ## Major features and improvements +* Implemented `KedroDataCatalog` repeating `DataCatalog` functionality with a few API enhancements: + * Removed `_FrozenDatasets` and access datasets as properties; + * Added get dataset by name feature: dedicated function and access by key; + * Added iterate over the datasets feature; + * `add_feed_dict()` was simplified and renamed to `add_raw_data()`; + * Datasets' initialisation was moved out from `from_config()` method to the constructor. * Implemented `Protocol` abstraction for the current `DataCatalog` and adding new catalog implementations. * Refactored `kedro run` and `kedro catalog` commands. * Moved pattern resolution logic from `DataCatalog` to a separate component - `CatalogConfigResolver`. Updated `DataCatalog` to use `CatalogConfigResolver` internally. From 135cb0e57f3c9ac45c4bad226ba6cbfab8c6a344 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 18 Sep 2024 11:08:10 +0100 Subject: [PATCH 129/173] Removed deep copies Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 9949abdd95..4709e2270a 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -71,7 +71,7 @@ def __init__( @property def datasets(self) -> dict[str, Any]: - return copy.deepcopy(self._datasets) + return copy.copy(self._datasets) @datasets.setter def datasets(self, value: Any) -> None: @@ -90,7 +90,7 @@ def __iter__(self) -> AbstractDataset: yield from self._datasets.values() def __getitem__(self, ds_name: str) -> AbstractDataset: - return copy.deepcopy(self.get_dataset(ds_name)) + return self.get_dataset(ds_name) def __setitem__(self, key: str, value: Any) -> None: msg = "Operation not allowed!" From ca4867c45da88fae17ac6af9bb0f3d4159d51644 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 18 Sep 2024 14:21:38 +0100 Subject: [PATCH 130/173] Removed some interface that will be changed in the next version Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 16 +------------ tests/io/test_kedro_data_catalog.py | 35 +++++++++-------------------- 2 files changed, 11 insertions(+), 40 deletions(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 4709e2270a..15943c34ae 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -76,7 +76,7 @@ def datasets(self) -> dict[str, Any]: @datasets.setter def datasets(self, value: Any) -> None: raise AttributeError( - "Operation not allowed! Please change datasets through configuration." + "Operation not allowed! Please use KedroDataCatalog.add() instead." ) @property @@ -86,20 +86,6 @@ def config_resolver(self) -> CatalogConfigResolver: def __repr__(self) -> str: return self._datasets.__repr__() - def __iter__(self) -> AbstractDataset: - yield from self._datasets.values() - - def __getitem__(self, ds_name: str) -> AbstractDataset: - return self.get_dataset(ds_name) - - def __setitem__(self, key: str, value: Any) -> None: - msg = "Operation not allowed!" - if key in self: - msg = f"{msg} Please change datasets through configuration." - else: - msg = f"{msg} Please use KedroDataCatalog.add() instead." - raise AttributeError(msg) - def __contains__(self, dataset_name: str) -> bool: """Check if an item is in the catalog as a materialised dataset or pattern""" return ( diff --git a/tests/io/test_kedro_data_catalog.py b/tests/io/test_kedro_data_catalog.py index de05af7cec..dccb65dc76 100644 --- a/tests/io/test_kedro_data_catalog.py +++ b/tests/io/test_kedro_data_catalog.py @@ -179,26 +179,20 @@ def test_eq(self, multi_catalog, data_catalog): def test_datasets_on_init(self, data_catalog_from_config): """Check datasets are loaded correctly on construction""" - assert isinstance(data_catalog_from_config["boats"], CSVDataset) - assert isinstance(data_catalog_from_config["cars"], CSVDataset) + assert isinstance(data_catalog_from_config.datasets["boats"], CSVDataset) + assert isinstance(data_catalog_from_config.datasets["cars"], CSVDataset) def test_datasets_on_add(self, data_catalog_from_config): """Check datasets are updated correctly after adding""" data_catalog_from_config.add("new_dataset", CSVDataset(filepath="some_path")) - assert isinstance(data_catalog_from_config["new_dataset"], CSVDataset) - assert isinstance(data_catalog_from_config["boats"], CSVDataset) + assert isinstance(data_catalog_from_config.datasets["new_dataset"], CSVDataset) + assert isinstance(data_catalog_from_config.datasets["boats"], CSVDataset) def test_adding_datasets_not_allowed(self, data_catalog_from_config): """Check error if user tries to update the datasets attribute""" pattern = r"Operation not allowed! Please use KedroDataCatalog.add\(\) instead." with pytest.raises(AttributeError, match=pattern): - data_catalog_from_config["new_dataset"] = None - - def test_mutating_datasets_not_allowed(self, data_catalog_from_config): - """Check error if user tries to update the datasets attribute""" - pattern = "Operation not allowed! Please change datasets through configuration." - with pytest.raises(AttributeError, match=pattern): - data_catalog_from_config["boats"] = None + data_catalog_from_config.datasets = None def test_confirm(self, mocker, caplog): """Confirm the dataset""" @@ -287,21 +281,12 @@ def test_init_with_raw_data(self, dummy_dataframe, dataset): ) assert "ds" in catalog assert "df" in catalog - assert isinstance(catalog["ds"], CSVDataset) - assert isinstance(catalog["df"], MemoryDataset) - - def test_set_datasets_not_allowed(self, data_catalog_from_config): - """Check error if user tries to modify datasets attribute""" - pattern = "Operation not allowed! Please change datasets through configuration." - with pytest.raises(AttributeError, match=pattern): - data_catalog_from_config.datasets = None + assert isinstance(catalog.datasets["ds"], CSVDataset) + assert isinstance(catalog.datasets["df"], MemoryDataset) def test_repr(self, data_catalog): assert data_catalog.__repr__() == str(data_catalog) - def test_iter(self, data_catalog): - assert list(data_catalog._datasets.values()) == [ds for ds in data_catalog] - def test_missing_keys_from_load_versions(self, sane_config): """Test load versions include keys missing in the catalog""" pattern = "'load_versions' keys [version] are not found in the catalog." @@ -595,7 +580,7 @@ def test_from_sane_config_versioned(self, sane_config, dummy_dataframe): # Verify that `VERSION_FORMAT` can help regenerate `current_ts`. actual_timestamp = datetime.strptime( - catalog["boats"].resolve_load_version(), + catalog.datasets["boats"].resolve_load_version(), VERSION_FORMAT, ) expected_timestamp = current_ts.replace( @@ -642,11 +627,11 @@ def test_compare_tracking_and_other_dataset_versioned( # Verify that saved version on tracking dataset is the same as on the CSV dataset csv_timestamp = datetime.strptime( - catalog["boats"].resolve_save_version(), + catalog.datasets["boats"].resolve_save_version(), VERSION_FORMAT, ) tracking_timestamp = datetime.strptime( - catalog["planes"].resolve_save_version(), + catalog.datasets["planes"].resolve_save_version(), VERSION_FORMAT, ) From 4745f71981f0e26f83485776e64230dc170c21b6 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 18 Sep 2024 14:24:26 +0100 Subject: [PATCH 131/173] Removed key completions Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 15943c34ae..d37a50ae02 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -99,9 +99,6 @@ def __eq__(self, other) -> bool: # type: ignore[no-untyped-def] other.config_resolver.list_patterns(), ) - def _ipython_key_completions_(self) -> list[str]: - return list(self._datasets.keys()) - @property def _logger(self) -> logging.Logger: return logging.getLogger(__name__) From 033a0b7a36f74462bd0cb9eae655feb06848da45 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 18 Sep 2024 14:42:32 +0100 Subject: [PATCH 132/173] Fixinf typos Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index d37a50ae02..da2556ee96 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -54,7 +54,7 @@ def __init__( by all data sets with enabled versioning. It must: a) be a case-insensitive string that conforms with operating system filename limitations, b) always return the latest version when - sorted in lexicographical order.. + sorted in lexicographical order. """ self._config_resolver = config_resolver or CatalogConfigResolver() self._datasets = datasets or {} @@ -297,7 +297,7 @@ def add_feed_dict(self, feed_dict: dict[str, Any], replace: bool = False) -> Non def shallow_copy( self, extra_dataset_patterns: Patterns | None = None ) -> KedroDataCatalog: - # TODO: remove when old catalog + # TODO: remove when removing old catalog """Returns a shallow copy of the current object. Returns: From e74ffda4cdff79b8607cab0ef377cb237be9b105 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 18 Sep 2024 16:42:01 +0100 Subject: [PATCH 133/173] Removed key completions test Signed-off-by: Elena Khaustova --- tests/io/test_kedro_data_catalog.py | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/tests/io/test_kedro_data_catalog.py b/tests/io/test_kedro_data_catalog.py index dccb65dc76..7aabe2c1b3 100644 --- a/tests/io/test_kedro_data_catalog.py +++ b/tests/io/test_kedro_data_catalog.py @@ -254,26 +254,6 @@ def test_shallow_copy_adds_patterns( data_catalog = data_catalog.shallow_copy(runtime_patterns) assert data_catalog.config_resolver.list_patterns() == sorted_keys_expected - def test_key_completions(self, data_catalog_from_config): - """Test catalog.datasets key completions""" - assert isinstance(data_catalog_from_config.datasets["boats"], CSVDataset) - assert isinstance(data_catalog_from_config.datasets["cars"], CSVDataset) - data_catalog_from_config.add_feed_dict( - { - "params:model_options": [1, 2, 4], - "params:model_options.random_state": [0, 42, 67], - } - ) - assert isinstance( - data_catalog_from_config.datasets["params:model_options"], MemoryDataset - ) - assert set(data_catalog_from_config._ipython_key_completions_()) == { - "boats", - "cars", - "params:model_options", - "params:model_options.random_state", - } - def test_init_with_raw_data(self, dummy_dataframe, dataset): """Test catalog initialisation with raw data""" catalog = KedroDataCatalog( From 00af3ec88b015978e4d01c4268881c33703f54d6 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 18 Sep 2024 19:54:59 +0100 Subject: [PATCH 134/173] Replaced data set with dataset Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index da2556ee96..0b1b640671 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -39,19 +39,19 @@ def __init__( """``KedroDataCatalog`` stores instances of ``AbstractDataset`` implementations to provide ``load`` and ``save`` capabilities from anywhere in the program. To use a ``KedroDataCatalog``, you need to - instantiate it with a dictionary of data sets. Then it will act as a + instantiate it with a dictionary of datasets. Then it will act as a single point of reference for your calls, relaying load and save - functions to the underlying data sets. + functions to the underlying datasets. Args: - datasets: A dictionary of data set names and data set instances. + datasets: A dictionary of dataset names and dataset instances. raw_data: A dictionary with data to be added in memory as `MemoryDataset`` instances. Keys represent dataset names and the values are raw data. config_resolver: An instance of CatalogConfigResolver to resolve dataset patterns and configurations. - load_versions: A mapping between data set names and versions - to load. Has no effect on data sets without enabled versioning. + load_versions: A mapping between dataset names and versions + to load. Has no effect on datasets without enabled versioning. save_version: Version string to be used for ``save`` operations - by all data sets with enabled versioning. It must: a) be a + by all datasets with enabled versioning. It must: a) be a case-insensitive string that conforms with operating system filename limitations, b) always return the latest version when sorted in lexicographical order. From 2de7ccb476fe4befed13932f141a33722157dda6 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 18 Sep 2024 20:11:51 +0100 Subject: [PATCH 135/173] Added docstring for get_dataset() method Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 0b1b640671..6066e94453 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -165,6 +165,25 @@ def _add_from_config(self, ds_name: str, ds_config: dict[str, Any]) -> None: def get_dataset( self, ds_name: str, version: Version | None = None, suggest: bool = True ) -> AbstractDataset: + """Get a dataset by name from an internal collection of datasets. + + If a dataset is not in the collection but matches any pattern + it is instantiated and added to the collection first, then returned. + + Args: + ds_name: A dataset name. + version: Optional argument for concrete dataset version to be loaded. + Works only with versioned datasets. + suggest: Optional argument whether to suggest fuzzy-matching datasets' names + in the DatasetNotFoundError message. + + Returns: + An instance of AbstractDataset. + + Raises: + DatasetNotFoundError: When a dataset with the given name + is not in the collection and do not match patterns. + """ ds_config = self._config_resolver.resolve_dataset_pattern(ds_name) if ds_name not in self._datasets and ds_config: From 8affed6ad74010cb2ae0672e67c2e4b2ad27d24d Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 18 Sep 2024 20:16:09 +0100 Subject: [PATCH 136/173] Renamed pytest fixture Signed-off-by: Elena Khaustova --- tests/io/conftest.py | 12 +-- tests/io/test_data_catalog.py | 140 +++++++++++++------------- tests/io/test_kedro_data_catalog.py | 146 ++++++++++++++-------------- 3 files changed, 152 insertions(+), 146 deletions(-) diff --git a/tests/io/conftest.py b/tests/io/conftest.py index f6d6b1191e..9abce4c83e 100644 --- a/tests/io/conftest.py +++ b/tests/io/conftest.py @@ -35,7 +35,7 @@ def dataset(filepath): @pytest.fixture -def sane_config(filepath): +def correct_config(filepath): return { "catalog": { "boats": {"type": "pandas.CSVDataset", "filepath": filepath}, @@ -52,18 +52,18 @@ def sane_config(filepath): @pytest.fixture -def sane_config_with_nested_creds(sane_config): - sane_config["catalog"]["cars"]["credentials"] = { +def correct_config_with_nested_creds(correct_config): + correct_config["catalog"]["cars"]["credentials"] = { "client_kwargs": {"credentials": "other_credentials"}, "key": "secret", } - sane_config["credentials"]["other_credentials"] = { + correct_config["credentials"]["other_credentials"] = { "client_kwargs": { "aws_access_key_id": "OTHER_FAKE_ACCESS_KEY", "aws_secret_access_key": "OTHER_FAKE_SECRET_KEY", } } - return sane_config + return correct_config @pytest.fixture @@ -74,7 +74,7 @@ def bad_config(filepath): @pytest.fixture -def sane_config_with_tracking_ds(tmp_path): +def correct_config_with_tracking_ds(tmp_path): boat_path = (tmp_path / "some" / "dir" / "test.csv").as_posix() plane_path = (tmp_path / "some" / "dir" / "metrics.json").as_posix() return { diff --git a/tests/io/test_data_catalog.py b/tests/io/test_data_catalog.py index ee5c760833..5fe0967260 100644 --- a/tests/io/test_data_catalog.py +++ b/tests/io/test_data_catalog.py @@ -163,8 +163,8 @@ def data_catalog(dataset): @pytest.fixture -def data_catalog_from_config(sane_config): - return DataCatalog.from_config(**sane_config) +def data_catalog_from_config(correct_config): + return DataCatalog.from_config(**correct_config) class TestDataCatalog: @@ -398,78 +398,78 @@ def test_key_completions(self, data_catalog_from_config): class TestDataCatalogFromConfig: - def test_from_sane_config(self, data_catalog_from_config, dummy_dataframe): + def test_from_correct_config(self, data_catalog_from_config, dummy_dataframe): """Test populating the data catalog from config""" data_catalog_from_config.save("boats", dummy_dataframe) reloaded_df = data_catalog_from_config.load("boats") assert_frame_equal(reloaded_df, dummy_dataframe) - def test_config_missing_type(self, sane_config): + def test_config_missing_type(self, correct_config): """Check the error if type attribute is missing for some data set(s) in the config""" - del sane_config["catalog"]["boats"]["type"] + del correct_config["catalog"]["boats"]["type"] pattern = ( "An exception occurred when parsing config for dataset 'boats':\n" "'type' is missing from dataset catalog configuration" ) with pytest.raises(DatasetError, match=re.escape(pattern)): - DataCatalog.from_config(**sane_config) + DataCatalog.from_config(**correct_config) - def test_config_invalid_module(self, sane_config): + def test_config_invalid_module(self, correct_config): """Check the error if the type points to nonexistent module""" - sane_config["catalog"]["boats"]["type"] = ( + correct_config["catalog"]["boats"]["type"] = ( "kedro.invalid_module_name.io.CSVDataset" ) error_msg = "Class 'kedro.invalid_module_name.io.CSVDataset' not found" with pytest.raises(DatasetError, match=re.escape(error_msg)): - DataCatalog.from_config(**sane_config) + DataCatalog.from_config(**correct_config) - def test_config_relative_import(self, sane_config): + def test_config_relative_import(self, correct_config): """Check the error if the type points to a relative import""" - sane_config["catalog"]["boats"]["type"] = ".CSVDatasetInvalid" + correct_config["catalog"]["boats"]["type"] = ".CSVDatasetInvalid" pattern = "'type' class path does not support relative paths" with pytest.raises(DatasetError, match=re.escape(pattern)): - DataCatalog.from_config(**sane_config) + DataCatalog.from_config(**correct_config) - def test_config_import_kedro_datasets(self, sane_config, mocker): + def test_config_import_kedro_datasets(self, correct_config, mocker): """Test kedro_datasets default path to the dataset class""" # Spy _load_obj because kedro_datasets is not installed and we can't import it. import kedro.io.core spy = mocker.spy(kedro.io.core, "_load_obj") - parse_dataset_definition(sane_config["catalog"]["boats"]) + parse_dataset_definition(correct_config["catalog"]["boats"]) for prefix, call_args in zip(_DEFAULT_PACKAGES, spy.call_args_list): # In Python 3.7 call_args.args is not available thus we access the call # arguments with less meaningful index. # The 1st index returns a tuple, the 2nd index return the name of module. assert call_args[0][0] == f"{prefix}pandas.CSVDataset" - def test_config_import_extras(self, sane_config): + def test_config_import_extras(self, correct_config): """Test kedro_datasets default path to the dataset class""" - sane_config["catalog"]["boats"]["type"] = "pandas.CSVDataset" - assert DataCatalog.from_config(**sane_config) + correct_config["catalog"]["boats"]["type"] = "pandas.CSVDataset" + assert DataCatalog.from_config(**correct_config) - def test_config_missing_class(self, sane_config): + def test_config_missing_class(self, correct_config): """Check the error if the type points to nonexistent class""" - sane_config["catalog"]["boats"]["type"] = "kedro.io.CSVDatasetInvalid" + correct_config["catalog"]["boats"]["type"] = "kedro.io.CSVDatasetInvalid" pattern = ( "An exception occurred when parsing config for dataset 'boats':\n" "Class 'kedro.io.CSVDatasetInvalid' not found, is this a typo?" ) with pytest.raises(DatasetError, match=re.escape(pattern)): - DataCatalog.from_config(**sane_config) + DataCatalog.from_config(**correct_config) @pytest.mark.skipif( sys.version_info < (3, 9), reason="for python 3.8 kedro-datasets version 1.8 is used which has the old spelling", ) - def test_config_incorrect_spelling(self, sane_config): + def test_config_incorrect_spelling(self, correct_config): """Check hint if the type uses the old DataSet spelling""" - sane_config["catalog"]["boats"]["type"] = "pandas.CSVDataSet" + correct_config["catalog"]["boats"]["type"] = "pandas.CSVDataSet" pattern = ( "An exception occurred when parsing config for dataset 'boats':\n" @@ -478,63 +478,63 @@ def test_config_incorrect_spelling(self, sane_config): " make sure that the dataset name uses the `Dataset` spelling instead of `DataSet`." ) with pytest.raises(DatasetError, match=re.escape(pattern)): - DataCatalog.from_config(**sane_config) + DataCatalog.from_config(**correct_config) - def test_config_invalid_dataset(self, sane_config): + def test_config_invalid_dataset(self, correct_config): """Check the error if the type points to invalid class""" - sane_config["catalog"]["boats"]["type"] = "DataCatalog" + correct_config["catalog"]["boats"]["type"] = "DataCatalog" pattern = ( "An exception occurred when parsing config for dataset 'boats':\n" "Dataset type 'kedro.io.data_catalog.DataCatalog' is invalid: " "all data set types must extend 'AbstractDataset'" ) with pytest.raises(DatasetError, match=re.escape(pattern)): - DataCatalog.from_config(**sane_config) + DataCatalog.from_config(**correct_config) - def test_config_invalid_arguments(self, sane_config): + def test_config_invalid_arguments(self, correct_config): """Check the error if the data set config contains invalid arguments""" - sane_config["catalog"]["boats"]["save_and_load_args"] = False + correct_config["catalog"]["boats"]["save_and_load_args"] = False pattern = ( r"Dataset 'boats' must only contain arguments valid for " r"the constructor of '.*CSVDataset'" ) with pytest.raises(DatasetError, match=pattern): - DataCatalog.from_config(**sane_config) + DataCatalog.from_config(**correct_config) - def test_config_invalid_dataset_config(self, sane_config): - sane_config["catalog"]["invalid_entry"] = "some string" + def test_config_invalid_dataset_config(self, correct_config): + correct_config["catalog"]["invalid_entry"] = "some string" pattern = ( "Catalog entry 'invalid_entry' is not a valid dataset configuration. " "\nHint: If this catalog entry is intended for variable interpolation, " "make sure that the key is preceded by an underscore." ) with pytest.raises(DatasetError, match=pattern): - DataCatalog.from_config(**sane_config) + DataCatalog.from_config(**correct_config) def test_empty_config(self): """Test empty config""" assert DataCatalog.from_config(None) - def test_missing_credentials(self, sane_config): + def test_missing_credentials(self, correct_config): """Check the error if credentials can't be located""" - sane_config["catalog"]["cars"]["credentials"] = "missing" + correct_config["catalog"]["cars"]["credentials"] = "missing" with pytest.raises(KeyError, match=r"Unable to find credentials \'missing\'"): - DataCatalog.from_config(**sane_config) + DataCatalog.from_config(**correct_config) - def test_link_credentials(self, sane_config, mocker): + def test_link_credentials(self, correct_config, mocker): """Test credentials being linked to the relevant data set""" mock_client = mocker.patch("kedro_datasets.pandas.csv_dataset.fsspec") - config = deepcopy(sane_config) + config = deepcopy(correct_config) del config["catalog"]["boats"] DataCatalog.from_config(**config) - expected_client_kwargs = sane_config["credentials"]["s3_credentials"] + expected_client_kwargs = correct_config["credentials"]["s3_credentials"] mock_client.filesystem.assert_called_with("s3", **expected_client_kwargs) - def test_nested_credentials(self, sane_config_with_nested_creds, mocker): + def test_nested_credentials(self, correct_config_with_nested_creds, mocker): mock_client = mocker.patch("kedro_datasets.pandas.csv_dataset.fsspec") - config = deepcopy(sane_config_with_nested_creds) + config = deepcopy(correct_config_with_nested_creds) del config["catalog"]["boats"] DataCatalog.from_config(**config) @@ -551,13 +551,13 @@ def test_nested_credentials(self, sane_config_with_nested_creds, mocker): } mock_client.filesystem.assert_called_once_with("s3", **expected_client_kwargs) - def test_missing_nested_credentials(self, sane_config_with_nested_creds): - del sane_config_with_nested_creds["credentials"]["other_credentials"] + def test_missing_nested_credentials(self, correct_config_with_nested_creds): + del correct_config_with_nested_creds["credentials"]["other_credentials"] pattern = "Unable to find credentials 'other_credentials'" with pytest.raises(KeyError, match=pattern): - DataCatalog.from_config(**sane_config_with_nested_creds) + DataCatalog.from_config(**correct_config_with_nested_creds) - def test_missing_dependency(self, sane_config, mocker): + def test_missing_dependency(self, correct_config, mocker): """Test that dependency is missing.""" pattern = "dependency issue" @@ -569,12 +569,12 @@ def dummy_load(obj_path, *args, **kwargs): mocker.patch("kedro.io.core.load_obj", side_effect=dummy_load) with pytest.raises(DatasetError, match=pattern): - DataCatalog.from_config(**sane_config) + DataCatalog.from_config(**correct_config) - def test_idempotent_catalog(self, sane_config): + def test_idempotent_catalog(self, correct_config): """Test that data catalog instantiations are idempotent""" - _ = DataCatalog.from_config(**sane_config) - catalog = DataCatalog.from_config(**sane_config) + _ = DataCatalog.from_config(**correct_config) + catalog = DataCatalog.from_config(**correct_config) assert catalog def test_error_dataset_init(self, bad_config): @@ -614,18 +614,18 @@ def test_confirm(self, tmp_path, caplog, mocker): ("boats", "Dataset 'boats' does not have 'confirm' method"), ], ) - def test_bad_confirm(self, sane_config, dataset_name, pattern): + def test_bad_confirm(self, correct_config, dataset_name, pattern): """Test confirming non existent dataset or the one that does not have `confirm` method""" - data_catalog = DataCatalog.from_config(**sane_config) + data_catalog = DataCatalog.from_config(**correct_config) with pytest.raises(DatasetError, match=re.escape(pattern)): data_catalog.confirm(dataset_name) class TestDataCatalogVersioned: - def test_from_sane_config_versioned(self, sane_config, dummy_dataframe): + def test_from_correct_config_versioned(self, correct_config, dummy_dataframe): """Test load and save of versioned data sets from config""" - sane_config["catalog"]["boats"]["versioned"] = True + correct_config["catalog"]["boats"]["versioned"] = True # Decompose `generate_timestamp` to keep `current_ts` reference. current_ts = datetime.now(tz=timezone.utc) @@ -636,13 +636,13 @@ def test_from_sane_config_versioned(self, sane_config, dummy_dataframe): version = fmt.format(d=current_ts, ms=current_ts.microsecond // 1000) catalog = DataCatalog.from_config( - **sane_config, + **correct_config, load_versions={"boats": version}, save_version=version, ) catalog.save("boats", dummy_dataframe) - path = Path(sane_config["catalog"]["boats"]["filepath"]) + path = Path(correct_config["catalog"]["boats"]["filepath"]) path = path / version / path.name assert path.is_file() @@ -663,12 +663,14 @@ def test_from_sane_config_versioned(self, sane_config, dummy_dataframe): assert actual_timestamp == expected_timestamp @pytest.mark.parametrize("versioned", [True, False]) - def test_from_sane_config_versioned_warn(self, caplog, sane_config, versioned): + def test_from_correct_config_versioned_warn( + self, caplog, correct_config, versioned + ): """Check the warning if `version` attribute was added to the data set config""" - sane_config["catalog"]["boats"]["versioned"] = versioned - sane_config["catalog"]["boats"]["version"] = True - DataCatalog.from_config(**sane_config) + correct_config["catalog"]["boats"]["versioned"] = versioned + correct_config["catalog"]["boats"]["version"] = True + DataCatalog.from_config(**correct_config) log_record = caplog.records[0] expected_log_message = ( "'version' attribute removed from data set configuration since it " @@ -677,21 +679,21 @@ def test_from_sane_config_versioned_warn(self, caplog, sane_config, versioned): assert log_record.levelname == "WARNING" assert expected_log_message in log_record.message - def test_from_sane_config_load_versions_warn(self, sane_config): - sane_config["catalog"]["boats"]["versioned"] = True + def test_from_correct_config_load_versions_warn(self, correct_config): + correct_config["catalog"]["boats"]["versioned"] = True version = generate_timestamp() load_version = {"non-boart": version} pattern = r"\'load_versions\' keys \[non-boart\] are not found in the catalog\." with pytest.raises(DatasetNotFoundError, match=pattern): - DataCatalog.from_config(**sane_config, load_versions=load_version) + DataCatalog.from_config(**correct_config, load_versions=load_version) def test_compare_tracking_and_other_dataset_versioned( - self, sane_config_with_tracking_ds, dummy_dataframe + self, correct_config_with_tracking_ds, dummy_dataframe ): """Test saving of tracking data sets from config results in the same save version as other versioned datasets.""" - catalog = DataCatalog.from_config(**sane_config_with_tracking_ds) + catalog = DataCatalog.from_config(**correct_config_with_tracking_ds) catalog.save("boats", dummy_dataframe) dummy_data = {"col1": 1, "col2": 2, "col3": 3} @@ -709,20 +711,20 @@ def test_compare_tracking_and_other_dataset_versioned( assert tracking_timestamp == csv_timestamp - def test_load_version(self, sane_config, dummy_dataframe, mocker): + def test_load_version(self, correct_config, dummy_dataframe, mocker): """Test load versioned data sets from config""" new_dataframe = pd.DataFrame({"col1": [0, 0], "col2": [0, 0], "col3": [0, 0]}) - sane_config["catalog"]["boats"]["versioned"] = True + correct_config["catalog"]["boats"]["versioned"] = True mocker.patch( "kedro.io.data_catalog.generate_timestamp", side_effect=["first", "second"] ) # save first version of the dataset - catalog = DataCatalog.from_config(**sane_config) + catalog = DataCatalog.from_config(**correct_config) catalog.save("boats", dummy_dataframe) # save second version of the dataset - catalog = DataCatalog.from_config(**sane_config) + catalog = DataCatalog.from_config(**correct_config) catalog.save("boats", new_dataframe) assert_frame_equal(catalog.load("boats", version="first"), dummy_dataframe) @@ -730,11 +732,11 @@ def test_load_version(self, sane_config, dummy_dataframe, mocker): assert_frame_equal(catalog.load("boats"), new_dataframe) def test_load_version_on_unversioned_dataset( - self, sane_config, dummy_dataframe, mocker + self, correct_config, dummy_dataframe, mocker ): mocker.patch("kedro.io.data_catalog.generate_timestamp", return_value="first") - catalog = DataCatalog.from_config(**sane_config) + catalog = DataCatalog.from_config(**correct_config) catalog.save("boats", dummy_dataframe) with pytest.raises(DatasetError): diff --git a/tests/io/test_kedro_data_catalog.py b/tests/io/test_kedro_data_catalog.py index 7aabe2c1b3..84705824be 100644 --- a/tests/io/test_kedro_data_catalog.py +++ b/tests/io/test_kedro_data_catalog.py @@ -51,8 +51,8 @@ def multi_catalog(): @pytest.fixture -def data_catalog_from_config(sane_config): - return KedroDataCatalog.from_config(**sane_config) +def data_catalog_from_config(correct_config): + return KedroDataCatalog.from_config(**correct_config) class TestKedroDataCatalog: @@ -267,12 +267,12 @@ def test_init_with_raw_data(self, dummy_dataframe, dataset): def test_repr(self, data_catalog): assert data_catalog.__repr__() == str(data_catalog) - def test_missing_keys_from_load_versions(self, sane_config): + def test_missing_keys_from_load_versions(self, correct_config): """Test load versions include keys missing in the catalog""" pattern = "'load_versions' keys [version] are not found in the catalog." with pytest.raises(DatasetNotFoundError, match=re.escape(pattern)): KedroDataCatalog.from_config( - **sane_config, load_versions={"version": "test_version"} + **correct_config, load_versions={"version": "test_version"} ) def test_get_dataset_matching_pattern(self, data_catalog): @@ -290,78 +290,78 @@ def test_release(self, data_catalog): data_catalog.release("test") class TestKedroDataCatalogFromConfig: - def test_from_sane_config(self, data_catalog_from_config, dummy_dataframe): + def test_from_correct_config(self, data_catalog_from_config, dummy_dataframe): """Test populating the data catalog from config""" data_catalog_from_config.save("boats", dummy_dataframe) reloaded_df = data_catalog_from_config.load("boats") assert_frame_equal(reloaded_df, dummy_dataframe) - def test_config_missing_type(self, sane_config): + def test_config_missing_type(self, correct_config): """Check the error if type attribute is missing for some data set(s) in the config""" - del sane_config["catalog"]["boats"]["type"] + del correct_config["catalog"]["boats"]["type"] pattern = ( "An exception occurred when parsing config for dataset 'boats':\n" "'type' is missing from dataset catalog configuration" ) with pytest.raises(DatasetError, match=re.escape(pattern)): - KedroDataCatalog.from_config(**sane_config) + KedroDataCatalog.from_config(**correct_config) - def test_config_invalid_module(self, sane_config): + def test_config_invalid_module(self, correct_config): """Check the error if the type points to nonexistent module""" - sane_config["catalog"]["boats"]["type"] = ( + correct_config["catalog"]["boats"]["type"] = ( "kedro.invalid_module_name.io.CSVDataset" ) error_msg = "Class 'kedro.invalid_module_name.io.CSVDataset' not found" with pytest.raises(DatasetError, match=re.escape(error_msg)): - KedroDataCatalog.from_config(**sane_config) + KedroDataCatalog.from_config(**correct_config) - def test_config_relative_import(self, sane_config): + def test_config_relative_import(self, correct_config): """Check the error if the type points to a relative import""" - sane_config["catalog"]["boats"]["type"] = ".CSVDatasetInvalid" + correct_config["catalog"]["boats"]["type"] = ".CSVDatasetInvalid" pattern = "'type' class path does not support relative paths" with pytest.raises(DatasetError, match=re.escape(pattern)): - KedroDataCatalog.from_config(**sane_config) + KedroDataCatalog.from_config(**correct_config) - def test_config_import_kedro_datasets(self, sane_config, mocker): + def test_config_import_kedro_datasets(self, correct_config, mocker): """Test kedro_datasets default path to the dataset class""" # Spy _load_obj because kedro_datasets is not installed and we can't import it. import kedro.io.core spy = mocker.spy(kedro.io.core, "_load_obj") - parse_dataset_definition(sane_config["catalog"]["boats"]) + parse_dataset_definition(correct_config["catalog"]["boats"]) for prefix, call_args in zip(_DEFAULT_PACKAGES, spy.call_args_list): # In Python 3.7 call_args.args is not available thus we access the call # arguments with less meaningful index. # The 1st index returns a tuple, the 2nd index return the name of module. assert call_args[0][0] == f"{prefix}pandas.CSVDataset" - def test_config_import_extras(self, sane_config): + def test_config_import_extras(self, correct_config): """Test kedro_datasets default path to the dataset class""" - sane_config["catalog"]["boats"]["type"] = "pandas.CSVDataset" - assert KedroDataCatalog.from_config(**sane_config) + correct_config["catalog"]["boats"]["type"] = "pandas.CSVDataset" + assert KedroDataCatalog.from_config(**correct_config) - def test_config_missing_class(self, sane_config): + def test_config_missing_class(self, correct_config): """Check the error if the type points to nonexistent class""" - sane_config["catalog"]["boats"]["type"] = "kedro.io.CSVDatasetInvalid" + correct_config["catalog"]["boats"]["type"] = "kedro.io.CSVDatasetInvalid" pattern = ( "An exception occurred when parsing config for dataset 'boats':\n" "Class 'kedro.io.CSVDatasetInvalid' not found, is this a typo?" ) with pytest.raises(DatasetError, match=re.escape(pattern)): - KedroDataCatalog.from_config(**sane_config) + KedroDataCatalog.from_config(**correct_config) @pytest.mark.skipif( sys.version_info < (3, 9), reason="for python 3.8 kedro-datasets version 1.8 is used which has the old spelling", ) - def test_config_incorrect_spelling(self, sane_config): + def test_config_incorrect_spelling(self, correct_config): """Check hint if the type uses the old DataSet spelling""" - sane_config["catalog"]["boats"]["type"] = "pandas.CSVDataSet" + correct_config["catalog"]["boats"]["type"] = "pandas.CSVDataSet" pattern = ( "An exception occurred when parsing config for dataset 'boats':\n" @@ -370,65 +370,65 @@ def test_config_incorrect_spelling(self, sane_config): " make sure that the dataset name uses the `Dataset` spelling instead of `DataSet`." ) with pytest.raises(DatasetError, match=re.escape(pattern)): - KedroDataCatalog.from_config(**sane_config) + KedroDataCatalog.from_config(**correct_config) - def test_config_invalid_dataset(self, sane_config): + def test_config_invalid_dataset(self, correct_config): """Check the error if the type points to invalid class""" - sane_config["catalog"]["boats"]["type"] = "KedroDataCatalog" + correct_config["catalog"]["boats"]["type"] = "KedroDataCatalog" pattern = ( "An exception occurred when parsing config for dataset 'boats':\n" "Dataset type 'kedro.io.kedro_data_catalog.KedroDataCatalog' is invalid: " "all data set types must extend 'AbstractDataset'" ) with pytest.raises(DatasetError, match=re.escape(pattern)): - KedroDataCatalog.from_config(**sane_config) + KedroDataCatalog.from_config(**correct_config) - def test_config_invalid_arguments(self, sane_config): + def test_config_invalid_arguments(self, correct_config): """Check the error if the data set config contains invalid arguments""" - sane_config["catalog"]["boats"]["save_and_load_args"] = False + correct_config["catalog"]["boats"]["save_and_load_args"] = False pattern = ( r"Dataset 'boats' must only contain arguments valid for " r"the constructor of '.*CSVDataset'" ) with pytest.raises(DatasetError, match=pattern): - KedroDataCatalog.from_config(**sane_config) + KedroDataCatalog.from_config(**correct_config) - def test_config_invalid_dataset_config(self, sane_config): - sane_config["catalog"]["invalid_entry"] = "some string" + def test_config_invalid_dataset_config(self, correct_config): + correct_config["catalog"]["invalid_entry"] = "some string" pattern = ( "Catalog entry 'invalid_entry' is not a valid dataset configuration. " "\nHint: If this catalog entry is intended for variable interpolation, " "make sure that the key is preceded by an underscore." ) with pytest.raises(DatasetError, match=pattern): - KedroDataCatalog.from_config(**sane_config) + KedroDataCatalog.from_config(**correct_config) def test_empty_config(self): """Test empty config""" assert KedroDataCatalog.from_config(None) - def test_missing_credentials(self, sane_config): + def test_missing_credentials(self, correct_config): """Check the error if credentials can't be located""" - sane_config["catalog"]["cars"]["credentials"] = "missing" + correct_config["catalog"]["cars"]["credentials"] = "missing" with pytest.raises( KeyError, match=r"Unable to find credentials \'missing\'" ): - KedroDataCatalog.from_config(**sane_config) + KedroDataCatalog.from_config(**correct_config) - def test_link_credentials(self, sane_config, mocker): + def test_link_credentials(self, correct_config, mocker): """Test credentials being linked to the relevant data set""" mock_client = mocker.patch("kedro_datasets.pandas.csv_dataset.fsspec") - config = deepcopy(sane_config) + config = deepcopy(correct_config) del config["catalog"]["boats"] KedroDataCatalog.from_config(**config) - expected_client_kwargs = sane_config["credentials"]["s3_credentials"] + expected_client_kwargs = correct_config["credentials"]["s3_credentials"] mock_client.filesystem.assert_called_with("s3", **expected_client_kwargs) - def test_nested_credentials(self, sane_config_with_nested_creds, mocker): + def test_nested_credentials(self, correct_config_with_nested_creds, mocker): mock_client = mocker.patch("kedro_datasets.pandas.csv_dataset.fsspec") - config = deepcopy(sane_config_with_nested_creds) + config = deepcopy(correct_config_with_nested_creds) del config["catalog"]["boats"] KedroDataCatalog.from_config(**config) @@ -447,13 +447,13 @@ def test_nested_credentials(self, sane_config_with_nested_creds, mocker): "s3", **expected_client_kwargs ) - def test_missing_nested_credentials(self, sane_config_with_nested_creds): - del sane_config_with_nested_creds["credentials"]["other_credentials"] + def test_missing_nested_credentials(self, correct_config_with_nested_creds): + del correct_config_with_nested_creds["credentials"]["other_credentials"] pattern = "Unable to find credentials 'other_credentials'" with pytest.raises(KeyError, match=pattern): - KedroDataCatalog.from_config(**sane_config_with_nested_creds) + KedroDataCatalog.from_config(**correct_config_with_nested_creds) - def test_missing_dependency(self, sane_config, mocker): + def test_missing_dependency(self, correct_config, mocker): """Test that dependency is missing.""" pattern = "dependency issue" @@ -465,12 +465,12 @@ def dummy_load(obj_path, *args, **kwargs): mocker.patch("kedro.io.core.load_obj", side_effect=dummy_load) with pytest.raises(DatasetError, match=pattern): - KedroDataCatalog.from_config(**sane_config) + KedroDataCatalog.from_config(**correct_config) - def test_idempotent_catalog(self, sane_config): + def test_idempotent_catalog(self, correct_config): """Test that data catalog instantiations are idempotent""" - _ = KedroDataCatalog.from_config(**sane_config) - catalog = KedroDataCatalog.from_config(**sane_config) + _ = KedroDataCatalog.from_config(**correct_config) + catalog = KedroDataCatalog.from_config(**correct_config) assert catalog def test_error_dataset_init(self, bad_config): @@ -521,17 +521,17 @@ def test_confirm(self, tmp_path, caplog, mocker): ("boats", "Dataset 'boats' does not have 'confirm' method"), ], ) - def test_bad_confirm(self, sane_config, dataset_name, pattern): + def test_bad_confirm(self, correct_config, dataset_name, pattern): """Test confirming non existent dataset or the one that does not have `confirm` method""" - data_catalog = KedroDataCatalog.from_config(**sane_config) + data_catalog = KedroDataCatalog.from_config(**correct_config) with pytest.raises(DatasetError, match=re.escape(pattern)): data_catalog.confirm(dataset_name) class TestDataCatalogVersioned: - def test_from_sane_config_versioned(self, sane_config, dummy_dataframe): + def test_from_correct_config_versioned(self, correct_config, dummy_dataframe): """Test load and save of versioned data sets from config""" - sane_config["catalog"]["boats"]["versioned"] = True + correct_config["catalog"]["boats"]["versioned"] = True # Decompose `generate_timestamp` to keep `current_ts` reference. current_ts = datetime.now(tz=timezone.utc) @@ -542,13 +542,13 @@ def test_from_sane_config_versioned(self, sane_config, dummy_dataframe): version = fmt.format(d=current_ts, ms=current_ts.microsecond // 1000) catalog = KedroDataCatalog.from_config( - **sane_config, + **correct_config, load_versions={"boats": version}, save_version=version, ) catalog.save("boats", dummy_dataframe) - path = Path(sane_config["catalog"]["boats"]["filepath"]) + path = Path(correct_config["catalog"]["boats"]["filepath"]) path = path / version / path.name assert path.is_file() @@ -569,12 +569,14 @@ def test_from_sane_config_versioned(self, sane_config, dummy_dataframe): assert actual_timestamp == expected_timestamp @pytest.mark.parametrize("versioned", [True, False]) - def test_from_sane_config_versioned_warn(self, caplog, sane_config, versioned): + def test_from_correct_config_versioned_warn( + self, caplog, correct_config, versioned + ): """Check the warning if `version` attribute was added to the data set config""" - sane_config["catalog"]["boats"]["versioned"] = versioned - sane_config["catalog"]["boats"]["version"] = True - KedroDataCatalog.from_config(**sane_config) + correct_config["catalog"]["boats"]["versioned"] = versioned + correct_config["catalog"]["boats"]["version"] = True + KedroDataCatalog.from_config(**correct_config) log_record = caplog.records[0] expected_log_message = ( "'version' attribute removed from data set configuration since it " @@ -583,23 +585,25 @@ def test_from_sane_config_versioned_warn(self, caplog, sane_config, versioned): assert log_record.levelname == "WARNING" assert expected_log_message in log_record.message - def test_from_sane_config_load_versions_warn(self, sane_config): - sane_config["catalog"]["boats"]["versioned"] = True + def test_from_correct_config_load_versions_warn(self, correct_config): + correct_config["catalog"]["boats"]["versioned"] = True version = generate_timestamp() load_version = {"non-boart": version} pattern = ( r"\'load_versions\' keys \[non-boart\] are not found in the catalog\." ) with pytest.raises(DatasetNotFoundError, match=pattern): - KedroDataCatalog.from_config(**sane_config, load_versions=load_version) + KedroDataCatalog.from_config( + **correct_config, load_versions=load_version + ) def test_compare_tracking_and_other_dataset_versioned( - self, sane_config_with_tracking_ds, dummy_dataframe + self, correct_config_with_tracking_ds, dummy_dataframe ): """Test saving of tracking data sets from config results in the same save version as other versioned datasets.""" - catalog = KedroDataCatalog.from_config(**sane_config_with_tracking_ds) + catalog = KedroDataCatalog.from_config(**correct_config_with_tracking_ds) catalog.save("boats", dummy_dataframe) dummy_data = {"col1": 1, "col2": 2, "col3": 3} @@ -617,23 +621,23 @@ def test_compare_tracking_and_other_dataset_versioned( assert tracking_timestamp == csv_timestamp - def test_load_version(self, sane_config, dummy_dataframe, mocker): + def test_load_version(self, correct_config, dummy_dataframe, mocker): """Test load versioned data sets from config""" new_dataframe = pd.DataFrame( {"col1": [0, 0], "col2": [0, 0], "col3": [0, 0]} ) - sane_config["catalog"]["boats"]["versioned"] = True + correct_config["catalog"]["boats"]["versioned"] = True mocker.patch( "kedro.io.kedro_data_catalog.generate_timestamp", side_effect=["first", "second"], ) # save first version of the dataset - catalog = KedroDataCatalog.from_config(**sane_config) + catalog = KedroDataCatalog.from_config(**correct_config) catalog.save("boats", dummy_dataframe) # save second version of the dataset - catalog = KedroDataCatalog.from_config(**sane_config) + catalog = KedroDataCatalog.from_config(**correct_config) catalog.save("boats", new_dataframe) assert_frame_equal(catalog.load("boats", version="first"), dummy_dataframe) @@ -641,13 +645,13 @@ def test_load_version(self, sane_config, dummy_dataframe, mocker): assert_frame_equal(catalog.load("boats"), new_dataframe) def test_load_version_on_unversioned_dataset( - self, sane_config, dummy_dataframe, mocker + self, correct_config, dummy_dataframe, mocker ): mocker.patch( "kedro.io.kedro_data_catalog.generate_timestamp", return_value="first" ) - catalog = KedroDataCatalog.from_config(**sane_config) + catalog = KedroDataCatalog.from_config(**correct_config) catalog.save("boats", dummy_dataframe) with pytest.raises(DatasetError): From a52672e30e1efddff750630f520a0c1c5613c5ca Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 19 Sep 2024 23:45:02 +0100 Subject: [PATCH 137/173] Addressed review comments Signed-off-by: Elena Khaustova --- RELEASE.md | 2 +- kedro/io/kedro_data_catalog.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/RELEASE.md b/RELEASE.md index 4375ee8672..e874b2eb6f 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -5,7 +5,7 @@ * Removed `_FrozenDatasets` and access datasets as properties; * Added get dataset by name feature: dedicated function and access by key; * Added iterate over the datasets feature; - * `add_feed_dict()` was simplified and renamed to `add_raw_data()`; + * `add_feed_dict()` was simplified and renamed to `add_data()`; * Datasets' initialisation was moved out from `from_config()` method to the constructor. * Implemented `Protocol` abstraction for the current `DataCatalog` and adding new catalog implementations. * Refactored `kedro run` and `kedro catalog` commands. diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 6066e94453..918bf76d8d 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -67,7 +67,7 @@ def __init__( self._add_from_config(ds_name, ds_config) if raw_data: - self.add_raw_data(raw_data) + self.add_data(raw_data) @property def datasets(self) -> dict[str, Any]: @@ -84,7 +84,7 @@ def config_resolver(self) -> CatalogConfigResolver: return self._config_resolver def __repr__(self) -> str: - return self._datasets.__repr__() + return repr(self._datasets) def __contains__(self, dataset_name: str) -> bool: """Check if an item is in the catalog as a materialised dataset or pattern""" @@ -184,10 +184,10 @@ def get_dataset( DatasetNotFoundError: When a dataset with the given name is not in the collection and do not match patterns. """ - ds_config = self._config_resolver.resolve_dataset_pattern(ds_name) - - if ds_name not in self._datasets and ds_config: - self._add_from_config(ds_name, ds_config) + if ds_name not in self._datasets: + ds_config = self._config_resolver.resolve_dataset_pattern(ds_name) + if ds_config: + self._add_from_config(ds_name, ds_config) dataset = self._datasets.get(ds_name, None) @@ -303,7 +303,7 @@ def confirm(self, name: str) -> None: else: raise DatasetError(f"Dataset '{name}' does not have 'confirm' method") - def add_raw_data(self, data: dict[str, Any], replace: bool = False) -> None: + def add_data(self, data: dict[str, Any], replace: bool = False) -> None: # This method was simplified to add memory datasets only, since # adding AbstractDataset can be done via add() method for ds_name, ds_data in data.items(): @@ -311,7 +311,7 @@ def add_raw_data(self, data: dict[str, Any], replace: bool = False) -> None: def add_feed_dict(self, feed_dict: dict[str, Any], replace: bool = False) -> None: # TODO: remove when removing old catalog - return self.add_raw_data(feed_dict, replace) + return self.add_data(feed_dict, replace) def shallow_copy( self, extra_dataset_patterns: Patterns | None = None From 84f249cbc734c168fa750f81f2ef8256f17316a1 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 20 Sep 2024 14:40:15 +0100 Subject: [PATCH 138/173] Updated _assert_requirements_ok starters test Signed-off-by: Elena Khaustova --- tests/framework/cli/test_starters.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/tests/framework/cli/test_starters.py b/tests/framework/cli/test_starters.py index 32f618d68f..7f2641da10 100644 --- a/tests/framework/cli/test_starters.py +++ b/tests/framework/cli/test_starters.py @@ -147,17 +147,11 @@ def _assert_requirements_ok( assert "Congratulations!" in result.output assert f"has been created in the directory \n{root_path}" in result.output - requirements_file_path = root_path / "requirements.txt" pyproject_file_path = root_path / "pyproject.toml" tools_list = _parse_tools_input(tools) if "1" in tools_list: - with open(requirements_file_path) as requirements_file: - requirements = requirements_file.read() - - assert "ruff" in requirements - pyproject_config = toml.load(pyproject_file_path) expected = { "tool": { @@ -171,15 +165,11 @@ def _assert_requirements_ok( } } assert expected["tool"]["ruff"] == pyproject_config["tool"]["ruff"] + assert ( + "ruff~=0.1.8" in pyproject_config["project"]["optional-dependencies"]["dev"] + ) if "2" in tools_list: - with open(requirements_file_path) as requirements_file: - requirements = requirements_file.read() - - assert "pytest-cov~=3.0" in requirements - assert "pytest-mock>=1.7.1, <2.0" in requirements - assert "pytest~=7.2" in requirements - pyproject_config = toml.load(pyproject_file_path) expected = { "pytest": { @@ -198,6 +188,18 @@ def _assert_requirements_ok( assert expected["pytest"] == pyproject_config["tool"]["pytest"] assert expected["coverage"] == pyproject_config["tool"]["coverage"] + assert ( + "pytest-cov~=3.0" + in pyproject_config["project"]["optional-dependencies"]["dev"] + ) + assert ( + "pytest-mock>=1.7.1, <2.0" + in pyproject_config["project"]["optional-dependencies"]["dev"] + ) + assert ( + "pytest~=7.2" in pyproject_config["project"]["optional-dependencies"]["dev"] + ) + if "4" in tools_list: pyproject_config = toml.load(pyproject_file_path) expected = { From 2548119d32b5d19deac429ad896e79be900c2e50 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 20 Sep 2024 14:42:36 +0100 Subject: [PATCH 139/173] Revert "Updated _assert_requirements_ok starters test" This reverts commit 5208321a440432a9f11ec990c724b6ecf2fd7990. Signed-off-by: Elena Khaustova --- tests/framework/cli/test_starters.py | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/tests/framework/cli/test_starters.py b/tests/framework/cli/test_starters.py index 7f2641da10..32f618d68f 100644 --- a/tests/framework/cli/test_starters.py +++ b/tests/framework/cli/test_starters.py @@ -147,11 +147,17 @@ def _assert_requirements_ok( assert "Congratulations!" in result.output assert f"has been created in the directory \n{root_path}" in result.output + requirements_file_path = root_path / "requirements.txt" pyproject_file_path = root_path / "pyproject.toml" tools_list = _parse_tools_input(tools) if "1" in tools_list: + with open(requirements_file_path) as requirements_file: + requirements = requirements_file.read() + + assert "ruff" in requirements + pyproject_config = toml.load(pyproject_file_path) expected = { "tool": { @@ -165,11 +171,15 @@ def _assert_requirements_ok( } } assert expected["tool"]["ruff"] == pyproject_config["tool"]["ruff"] - assert ( - "ruff~=0.1.8" in pyproject_config["project"]["optional-dependencies"]["dev"] - ) if "2" in tools_list: + with open(requirements_file_path) as requirements_file: + requirements = requirements_file.read() + + assert "pytest-cov~=3.0" in requirements + assert "pytest-mock>=1.7.1, <2.0" in requirements + assert "pytest~=7.2" in requirements + pyproject_config = toml.load(pyproject_file_path) expected = { "pytest": { @@ -188,18 +198,6 @@ def _assert_requirements_ok( assert expected["pytest"] == pyproject_config["tool"]["pytest"] assert expected["coverage"] == pyproject_config["tool"]["coverage"] - assert ( - "pytest-cov~=3.0" - in pyproject_config["project"]["optional-dependencies"]["dev"] - ) - assert ( - "pytest-mock>=1.7.1, <2.0" - in pyproject_config["project"]["optional-dependencies"]["dev"] - ) - assert ( - "pytest~=7.2" in pyproject_config["project"]["optional-dependencies"]["dev"] - ) - if "4" in tools_list: pyproject_config = toml.load(pyproject_file_path) expected = { From ac124e343191a7fe12ea8477347f1903cf4a07e8 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 20 Sep 2024 14:48:47 +0100 Subject: [PATCH 140/173] Updated error message Signed-off-by: Elena Khaustova --- kedro/io/data_catalog.py | 2 +- kedro/io/kedro_data_catalog.py | 2 +- tests/io/test_kedro_data_catalog.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index 420f8857c8..1989030ae6 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -75,7 +75,7 @@ def __setattr__(self, key: str, value: Any) -> None: if key == "_original_names": super().__setattr__(key, value) return - msg = "Operation not allowed! " + msg = "Operation not allowed. " if key in self.__dict__: msg += "Please change datasets through configuration." else: diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 918bf76d8d..25b194736d 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -76,7 +76,7 @@ def datasets(self) -> dict[str, Any]: @datasets.setter def datasets(self, value: Any) -> None: raise AttributeError( - "Operation not allowed! Please use KedroDataCatalog.add() instead." + "Operation not allowed. Please use KedroDataCatalog.add() instead." ) @property diff --git a/tests/io/test_kedro_data_catalog.py b/tests/io/test_kedro_data_catalog.py index 84705824be..cf4e73a410 100644 --- a/tests/io/test_kedro_data_catalog.py +++ b/tests/io/test_kedro_data_catalog.py @@ -190,7 +190,7 @@ def test_datasets_on_add(self, data_catalog_from_config): def test_adding_datasets_not_allowed(self, data_catalog_from_config): """Check error if user tries to update the datasets attribute""" - pattern = r"Operation not allowed! Please use KedroDataCatalog.add\(\) instead." + pattern = r"Operation not allowed. Please use KedroDataCatalog.add\(\) instead." with pytest.raises(AttributeError, match=pattern): data_catalog_from_config.datasets = None From f62ed03d1e28b6974f157b3b454f8e9e8d056a13 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 20 Sep 2024 14:50:17 +0100 Subject: [PATCH 141/173] Replaced typo Signed-off-by: Elena Khaustova --- tests/io/test_data_catalog.py | 4 ++-- tests/io/test_kedro_data_catalog.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/io/test_data_catalog.py b/tests/io/test_data_catalog.py index 5fe0967260..a552d8959c 100644 --- a/tests/io/test_data_catalog.py +++ b/tests/io/test_data_catalog.py @@ -682,8 +682,8 @@ def test_from_correct_config_versioned_warn( def test_from_correct_config_load_versions_warn(self, correct_config): correct_config["catalog"]["boats"]["versioned"] = True version = generate_timestamp() - load_version = {"non-boart": version} - pattern = r"\'load_versions\' keys \[non-boart\] are not found in the catalog\." + load_version = {"non-boat": version} + pattern = r"\'load_versions\' keys \[non-boat\] are not found in the catalog\." with pytest.raises(DatasetNotFoundError, match=pattern): DataCatalog.from_config(**correct_config, load_versions=load_version) diff --git a/tests/io/test_kedro_data_catalog.py b/tests/io/test_kedro_data_catalog.py index cf4e73a410..f9147c9a43 100644 --- a/tests/io/test_kedro_data_catalog.py +++ b/tests/io/test_kedro_data_catalog.py @@ -588,9 +588,9 @@ def test_from_correct_config_versioned_warn( def test_from_correct_config_load_versions_warn(self, correct_config): correct_config["catalog"]["boats"]["versioned"] = True version = generate_timestamp() - load_version = {"non-boart": version} + load_version = {"non-boat": version} pattern = ( - r"\'load_versions\' keys \[non-boart\] are not found in the catalog\." + r"\'load_versions\' keys \[non-boat\] are not found in the catalog\." ) with pytest.raises(DatasetNotFoundError, match=pattern): KedroDataCatalog.from_config( From b65609fcab22d0c1c75de6f8f57e15bf8620a04d Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 20 Sep 2024 14:52:57 +0100 Subject: [PATCH 142/173] Replaced data set with dataset in docstrings Signed-off-by: Elena Khaustova --- tests/io/test_kedro_data_catalog.py | 48 ++++++++++++++--------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/tests/io/test_kedro_data_catalog.py b/tests/io/test_kedro_data_catalog.py index f9147c9a43..2470c1ec60 100644 --- a/tests/io/test_kedro_data_catalog.py +++ b/tests/io/test_kedro_data_catalog.py @@ -57,14 +57,14 @@ def data_catalog_from_config(correct_config): class TestKedroDataCatalog: def test_save_and_load(self, data_catalog, dummy_dataframe): - """Test saving and reloading the data set""" + """Test saving and reloading the dataset""" data_catalog.save("test", dummy_dataframe) reloaded_df = data_catalog.load("test") assert_frame_equal(reloaded_df, dummy_dataframe) def test_add_save_and_load(self, dataset, dummy_dataframe): - """Test adding and then saving and reloading the data set""" + """Test adding and then saving and reloading the dataset""" catalog = KedroDataCatalog(datasets={}) catalog.add("test", dataset) catalog.save("test", dummy_dataframe) @@ -73,34 +73,34 @@ def test_add_save_and_load(self, dataset, dummy_dataframe): assert_frame_equal(reloaded_df, dummy_dataframe) def test_load_error(self, data_catalog): - """Check the error when attempting to load a data set + """Check the error when attempting to load a dataset from nonexistent source""" - pattern = r"Failed while loading data from data set CSVDataset" + pattern = r"Failed while loading data from dataset CSVDataset" with pytest.raises(DatasetError, match=pattern): data_catalog.load("test") def test_add_dataset_twice(self, data_catalog, dataset): - """Check the error when attempting to add the data set twice""" + """Check the error when attempting to add the dataset twice""" pattern = r"Dataset 'test' has already been registered" with pytest.raises(DatasetAlreadyExistsError, match=pattern): data_catalog.add("test", dataset) def test_load_from_unregistered(self): - """Check the error when attempting to load unregistered data set""" + """Check the error when attempting to load unregistered dataset""" catalog = KedroDataCatalog(datasets={}) pattern = r"Dataset 'test' not found in the catalog" with pytest.raises(DatasetNotFoundError, match=pattern): catalog.load("test") def test_save_to_unregistered(self, dummy_dataframe): - """Check the error when attempting to save to unregistered data set""" + """Check the error when attempting to save to unregistered dataset""" catalog = KedroDataCatalog(datasets={}) pattern = r"Dataset 'test' not found in the catalog" with pytest.raises(DatasetNotFoundError, match=pattern): catalog.save("test", dummy_dataframe) def test_feed_dict(self, memory_catalog, conflicting_feed_dict): - """Test feed dict overriding some of the data sets""" + """Test feed dict overriding some of the datasets""" assert "data" in memory_catalog.load("ds1") memory_catalog.add_feed_dict(conflicting_feed_dict, replace=True) assert memory_catalog.load("ds1") == 0 @@ -114,7 +114,7 @@ def test_exists(self, data_catalog, dummy_dataframe): assert data_catalog.exists("test") def test_exists_not_implemented(self, caplog): - """Test calling `exists` on the data set, which didn't implement it""" + """Test calling `exists` on the dataset, which didn't implement it""" catalog = KedroDataCatalog(datasets={"test": LambdaDataset(None, None)}) result = catalog.exists("test") @@ -127,18 +127,18 @@ def test_exists_not_implemented(self, caplog): assert result is False def test_exists_invalid(self, data_catalog): - """Check the error when calling `exists` on invalid data set""" + """Check the error when calling `exists` on invalid dataset""" assert not data_catalog.exists("wrong_key") def test_release_unregistered(self, data_catalog): - """Check the error when calling `release` on unregistered data set""" + """Check the error when calling `release` on unregistered dataset""" pattern = r"Dataset \'wrong_key\' not found in the catalog" with pytest.raises(DatasetNotFoundError, match=pattern) as e: data_catalog.release("wrong_key") assert "did you mean" not in str(e.value) def test_release_unregistered_typo(self, data_catalog): - """Check the error when calling `release` on mistyped data set""" + """Check the error when calling `release` on mistyped dataset""" pattern = ( "Dataset 'text' not found in the catalog" " - did you mean one of these instead: test" @@ -147,7 +147,7 @@ def test_release_unregistered_typo(self, data_catalog): data_catalog.release("text") def test_multi_catalog_list(self, multi_catalog): - """Test data catalog which contains multiple data sets""" + """Test data catalog which contains multiple datasets""" entries = multi_catalog.list() assert "abc" in entries assert "xyz" in entries @@ -163,7 +163,7 @@ def test_multi_catalog_list(self, multi_catalog): ], ) def test_multi_catalog_list_regex(self, multi_catalog, pattern, expected): - """Test that regex patterns filter data sets accordingly""" + """Test that regex patterns filter datasets accordingly""" assert multi_catalog.list(regex_search=pattern) == expected def test_multi_catalog_list_bad_regex(self, multi_catalog): @@ -297,7 +297,7 @@ def test_from_correct_config(self, data_catalog_from_config, dummy_dataframe): assert_frame_equal(reloaded_df, dummy_dataframe) def test_config_missing_type(self, correct_config): - """Check the error if type attribute is missing for some data set(s) + """Check the error if type attribute is missing for some dataset(s) in the config""" del correct_config["catalog"]["boats"]["type"] pattern = ( @@ -378,13 +378,13 @@ def test_config_invalid_dataset(self, correct_config): pattern = ( "An exception occurred when parsing config for dataset 'boats':\n" "Dataset type 'kedro.io.kedro_data_catalog.KedroDataCatalog' is invalid: " - "all data set types must extend 'AbstractDataset'" + "all dataset types must extend 'AbstractDataset'" ) with pytest.raises(DatasetError, match=re.escape(pattern)): KedroDataCatalog.from_config(**correct_config) def test_config_invalid_arguments(self, correct_config): - """Check the error if the data set config contains invalid arguments""" + """Check the error if the dataset config contains invalid arguments""" correct_config["catalog"]["boats"]["save_and_load_args"] = False pattern = ( r"Dataset 'boats' must only contain arguments valid for " @@ -416,7 +416,7 @@ def test_missing_credentials(self, correct_config): KedroDataCatalog.from_config(**correct_config) def test_link_credentials(self, correct_config, mocker): - """Test credentials being linked to the relevant data set""" + """Test credentials being linked to the relevant dataset""" mock_client = mocker.patch("kedro_datasets.pandas.csv_dataset.fsspec") config = deepcopy(correct_config) del config["catalog"]["boats"] @@ -474,7 +474,7 @@ def test_idempotent_catalog(self, correct_config): assert catalog def test_error_dataset_init(self, bad_config): - """Check the error when trying to instantiate erroneous data set""" + """Check the error when trying to instantiate erroneous dataset""" pattern = r"Failed to instantiate dataset \'bad\' of type '.*BadDataset'" with pytest.raises(DatasetError, match=pattern): KedroDataCatalog.from_config(bad_config, None) @@ -530,7 +530,7 @@ def test_bad_confirm(self, correct_config, dataset_name, pattern): class TestDataCatalogVersioned: def test_from_correct_config_versioned(self, correct_config, dummy_dataframe): - """Test load and save of versioned data sets from config""" + """Test load and save of versioned datasets from config""" correct_config["catalog"]["boats"]["versioned"] = True # Decompose `generate_timestamp` to keep `current_ts` reference. @@ -573,13 +573,13 @@ def test_from_correct_config_versioned_warn( self, caplog, correct_config, versioned ): """Check the warning if `version` attribute was added - to the data set config""" + to the dataset config""" correct_config["catalog"]["boats"]["versioned"] = versioned correct_config["catalog"]["boats"]["version"] = True KedroDataCatalog.from_config(**correct_config) log_record = caplog.records[0] expected_log_message = ( - "'version' attribute removed from data set configuration since it " + "'version' attribute removed from dataset configuration since it " "is a reserved word and cannot be directly specified" ) assert log_record.levelname == "WARNING" @@ -600,7 +600,7 @@ def test_from_correct_config_load_versions_warn(self, correct_config): def test_compare_tracking_and_other_dataset_versioned( self, correct_config_with_tracking_ds, dummy_dataframe ): - """Test saving of tracking data sets from config results in the same + """Test saving of tracking datasets from config results in the same save version as other versioned datasets.""" catalog = KedroDataCatalog.from_config(**correct_config_with_tracking_ds) @@ -622,7 +622,7 @@ def test_compare_tracking_and_other_dataset_versioned( assert tracking_timestamp == csv_timestamp def test_load_version(self, correct_config, dummy_dataframe, mocker): - """Test load versioned data sets from config""" + """Test load versioned datasets from config""" new_dataframe = pd.DataFrame( {"col1": [0, 0], "col2": [0, 0], "col3": [0, 0]} ) From 17199ad7afd54b75105afaf7f6b02df8c9b73f59 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 20 Sep 2024 15:02:29 +0100 Subject: [PATCH 143/173] Updated tests Signed-off-by: Elena Khaustova --- tests/io/test_kedro_data_catalog.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/tests/io/test_kedro_data_catalog.py b/tests/io/test_kedro_data_catalog.py index 2470c1ec60..b98e8fae83 100644 --- a/tests/io/test_kedro_data_catalog.py +++ b/tests/io/test_kedro_data_catalog.py @@ -75,7 +75,7 @@ def test_add_save_and_load(self, dataset, dummy_dataframe): def test_load_error(self, data_catalog): """Check the error when attempting to load a dataset from nonexistent source""" - pattern = r"Failed while loading data from dataset CSVDataset" + pattern = r"Failed while loading data from data set CSVDataset" with pytest.raises(DatasetError, match=pattern): data_catalog.load("test") @@ -334,15 +334,7 @@ def test_config_import_kedro_datasets(self, correct_config, mocker): spy = mocker.spy(kedro.io.core, "_load_obj") parse_dataset_definition(correct_config["catalog"]["boats"]) for prefix, call_args in zip(_DEFAULT_PACKAGES, spy.call_args_list): - # In Python 3.7 call_args.args is not available thus we access the call - # arguments with less meaningful index. - # The 1st index returns a tuple, the 2nd index return the name of module. - assert call_args[0][0] == f"{prefix}pandas.CSVDataset" - - def test_config_import_extras(self, correct_config): - """Test kedro_datasets default path to the dataset class""" - correct_config["catalog"]["boats"]["type"] = "pandas.CSVDataset" - assert KedroDataCatalog.from_config(**correct_config) + assert call_args.args[0] == f"{prefix}pandas.CSVDataset" def test_config_missing_class(self, correct_config): """Check the error if the type points to nonexistent class""" @@ -378,7 +370,7 @@ def test_config_invalid_dataset(self, correct_config): pattern = ( "An exception occurred when parsing config for dataset 'boats':\n" "Dataset type 'kedro.io.kedro_data_catalog.KedroDataCatalog' is invalid: " - "all dataset types must extend 'AbstractDataset'" + "all data set types must extend 'AbstractDataset'" ) with pytest.raises(DatasetError, match=re.escape(pattern)): KedroDataCatalog.from_config(**correct_config) @@ -579,7 +571,7 @@ def test_from_correct_config_versioned_warn( KedroDataCatalog.from_config(**correct_config) log_record = caplog.records[0] expected_log_message = ( - "'version' attribute removed from dataset configuration since it " + "'version' attribute removed from data set configuration since it " "is a reserved word and cannot be directly specified" ) assert log_record.levelname == "WARNING" From 6d5f094d465092925857f1d470ab03d77243de13 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 23 Sep 2024 11:08:32 +0100 Subject: [PATCH 144/173] Made KedroDataCatalog subclass from CatalogProtocol Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 25b194736d..fecb845d60 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -17,6 +17,7 @@ from kedro.io.core import ( AbstractDataset, AbstractVersionedDataset, + CatalogProtocol, DatasetAlreadyExistsError, DatasetError, DatasetNotFoundError, @@ -27,7 +28,7 @@ from kedro.utils import _format_rich, _has_rich_handler -class KedroDataCatalog: +class KedroDataCatalog(CatalogProtocol): def __init__( self, datasets: dict[str, AbstractDataset] | None = None, From e24b2a6d44a3c0d56bab2c77c867577f745cfcd1 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 23 Sep 2024 11:52:54 +0100 Subject: [PATCH 145/173] Updated release notes Signed-off-by: Elena Khaustova --- RELEASE.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/RELEASE.md b/RELEASE.md index b44fca53c2..0655a512d6 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -3,8 +3,7 @@ ## Major features and improvements * Implemented `KedroDataCatalog` repeating `DataCatalog` functionality with a few API enhancements: * Removed `_FrozenDatasets` and access datasets as properties; - * Added get dataset by name feature: dedicated function and access by key; - * Added iterate over the datasets feature; + * Added get dataset by name feature; * `add_feed_dict()` was simplified and renamed to `add_data()`; * Datasets' initialisation was moved out from `from_config()` method to the constructor. * Implemented `Protocol` abstraction for the current `DataCatalog` and adding new catalog implementations. From 26f3f9900b3391b5768c5f623cec10ef6fb61554 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 23 Sep 2024 17:37:49 +0100 Subject: [PATCH 146/173] Implemented iter, getitem, setitem Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 46 +++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index fecb845d60..12ee00fbfd 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -7,7 +7,6 @@ from __future__ import annotations -import copy import difflib import logging import re @@ -72,7 +71,7 @@ def __init__( @property def datasets(self) -> dict[str, Any]: - return copy.copy(self._datasets) + return self.items() @datasets.setter def datasets(self, value: Any) -> None: @@ -100,6 +99,26 @@ def __eq__(self, other) -> bool: # type: ignore[no-untyped-def] other.config_resolver.list_patterns(), ) + def keys(self, regex_search: str | None = None) -> list[str]: + return self._filter_keys(regex_search) + + def values(self, regex_search: str | None = None) -> list[AbstractDataset]: + return [self._datasets[key] for key in self._filter_keys(regex_search)] + + def items(self, regex_search: str | None = None) -> dict[str, AbstractDataset]: + return {key: self._datasets[key] for key in self._filter_keys(regex_search)} + + def __iter__(self) -> tuple[str, AbstractDataset]: + yield from self._datasets.items() + + def __getitem__(self, ds_name: str) -> AbstractDataset: + return self.get_dataset(ds_name) + + def __setitem__(self, key: str, value: Any) -> None: + if key in self._datasets: + self._logger.warning("Replacing dataset '%s'", key) + self._datasets[key] = value + @property def _logger(self) -> logging.Logger: return logging.getLogger(__name__) @@ -220,14 +239,11 @@ def add( self, ds_name: str, dataset: AbstractDataset, replace: bool = False ) -> None: """Adds a new ``AbstractDataset`` object to the ``KedroDataCatalog``.""" - if ds_name in self._datasets: - if replace: - self._logger.warning("Replacing dataset '%s'", ds_name) - else: - raise DatasetAlreadyExistsError( - f"Dataset '{ds_name}' has already been registered" - ) - self._datasets[ds_name] = dataset + if ds_name in self._datasets and not replace: + raise DatasetAlreadyExistsError( + f"Dataset '{ds_name}' has already been registered" + ) + self.__setitem__(ds_name, ds_name) def list(self, regex_search: str | None = None) -> list[str]: """ @@ -235,14 +251,16 @@ def list(self, regex_search: str | None = None) -> list[str]: This can be filtered by providing an optional regular expression which will only return matching keys. """ + if regex_search == "": + self._logger.warning("The empty string will not match any datasets") + return [] + + return self.keys(regex_search) + def _filter_keys(self, regex_search: str | None) -> list[str]: if regex_search is None: return list(self._datasets.keys()) - if not regex_search.strip(): - self._logger.warning("The empty string will not match any datasets") - return [] - try: pattern = re.compile(regex_search, flags=re.IGNORECASE) except re.error as exc: From 5bbedfa12d273dd79188a3603f23a17a2f3190f5 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 23 Sep 2024 17:57:28 +0100 Subject: [PATCH 147/173] Updated add_data and TODOs Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 12ee00fbfd..bb671c6844 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -10,7 +10,7 @@ import difflib import logging import re -from typing import Any +from typing import Any, List from kedro.io.catalog_config_resolver import CatalogConfigResolver, Patterns from kedro.io.core import ( @@ -71,10 +71,12 @@ def __init__( @property def datasets(self) -> dict[str, Any]: + # TODO: remove when removing old catalog return self.items() @datasets.setter def datasets(self, value: Any) -> None: + # TODO: remove when removing old catalog raise AttributeError( "Operation not allowed. Please use KedroDataCatalog.add() instead." ) @@ -99,10 +101,10 @@ def __eq__(self, other) -> bool: # type: ignore[no-untyped-def] other.config_resolver.list_patterns(), ) - def keys(self, regex_search: str | None = None) -> list[str]: + def keys(self, regex_search: str | None = None) -> List[str]: # noqa: UP006 return self._filter_keys(regex_search) - def values(self, regex_search: str | None = None) -> list[AbstractDataset]: + def values(self, regex_search: str | None = None) -> List[AbstractDataset]: # noqa: UP006 return [self._datasets[key] for key in self._filter_keys(regex_search)] def items(self, regex_search: str | None = None) -> dict[str, AbstractDataset]: @@ -117,7 +119,10 @@ def __getitem__(self, ds_name: str) -> AbstractDataset: def __setitem__(self, key: str, value: Any) -> None: if key in self._datasets: self._logger.warning("Replacing dataset '%s'", key) - self._datasets[key] = value + if isinstance(value, AbstractDataset): + self._datasets[key] = value + else: + self._datasets[key] = MemoryDataset(data=value) # type: ignore[abstract] @property def _logger(self) -> logging.Logger: @@ -238,26 +243,28 @@ def _get_dataset( def add( self, ds_name: str, dataset: AbstractDataset, replace: bool = False ) -> None: + # TODO: remove when removing old catalog """Adds a new ``AbstractDataset`` object to the ``KedroDataCatalog``.""" if ds_name in self._datasets and not replace: raise DatasetAlreadyExistsError( f"Dataset '{ds_name}' has already been registered" ) - self.__setitem__(ds_name, ds_name) + self.__setitem__(ds_name, dataset) - def list(self, regex_search: str | None = None) -> list[str]: + def list(self, regex_search: str | None = None) -> List[str]: # noqa: UP006 """ List of all dataset names registered in the catalog. This can be filtered by providing an optional regular expression which will only return matching keys. """ + # TODO: remove when removing old catalog if regex_search == "": self._logger.warning("The empty string will not match any datasets") return [] return self.keys(regex_search) - def _filter_keys(self, regex_search: str | None) -> list[str]: + def _filter_keys(self, regex_search: str | None) -> List[str]: # noqa: UP006 if regex_search is None: return list(self._datasets.keys()) @@ -323,10 +330,13 @@ def confirm(self, name: str) -> None: raise DatasetError(f"Dataset '{name}' does not have 'confirm' method") def add_data(self, data: dict[str, Any], replace: bool = False) -> None: - # This method was simplified to add memory datasets only, since - # adding AbstractDataset can be done via add() method + # TODO: remove when removing old catalog for ds_name, ds_data in data.items(): - self.add(ds_name, MemoryDataset(data=ds_data), replace) # type: ignore[abstract] + if ds_name in self._datasets and not replace: + raise DatasetAlreadyExistsError( + f"Dataset '{ds_name}' has already been registered" + ) + self.__setitem__(ds_name, ds_data) def add_feed_dict(self, feed_dict: dict[str, Any], replace: bool = False) -> None: # TODO: remove when removing old catalog From 5ca6b48df405cb27dc7ff9634a56063483aaca1b Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 24 Sep 2024 10:08:43 +0100 Subject: [PATCH 148/173] Added key completions Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index bb671c6844..d9f124857f 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -124,6 +124,9 @@ def __setitem__(self, key: str, value: Any) -> None: else: self._datasets[key] = MemoryDataset(data=value) # type: ignore[abstract] + def _ipython_key_completions_(self) -> list[str]: + return list(self._datasets.keys()) + @property def _logger(self) -> logging.Logger: return logging.getLogger(__name__) From 3914ccad3770d64b5c2a41e39e3940b4a39ad1a4 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 24 Sep 2024 11:02:47 +0100 Subject: [PATCH 149/173] Maded behavior dict like Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index d9f124857f..2aba230908 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -7,6 +7,7 @@ from __future__ import annotations +import copy import difflib import logging import re @@ -72,7 +73,7 @@ def __init__( @property def datasets(self) -> dict[str, Any]: # TODO: remove when removing old catalog - return self.items() + return copy.copy(self._datasets) @datasets.setter def datasets(self, value: Any) -> None: @@ -107,11 +108,13 @@ def keys(self, regex_search: str | None = None) -> List[str]: # noqa: UP006 def values(self, regex_search: str | None = None) -> List[AbstractDataset]: # noqa: UP006 return [self._datasets[key] for key in self._filter_keys(regex_search)] - def items(self, regex_search: str | None = None) -> dict[str, AbstractDataset]: - return {key: self._datasets[key] for key in self._filter_keys(regex_search)} + def items( + self, regex_search: str | None = None + ) -> List[tuple[str, AbstractDataset]]: # noqa: UP006 + return [(key, self._datasets[key]) for key in self._filter_keys(regex_search)] - def __iter__(self) -> tuple[str, AbstractDataset]: - yield from self._datasets.items() + def __iter__(self) -> str: + yield from self._datasets.keys() def __getitem__(self, ds_name: str) -> AbstractDataset: return self.get_dataset(ds_name) From 643219d92400bc34fdcec1081609b751d88afe8e Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 27 Sep 2024 11:12:38 +0100 Subject: [PATCH 150/173] Merged with main Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 68 +++++++++++++++++++++++++--------- 1 file changed, 51 insertions(+), 17 deletions(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index ce06e34aac..709ea6c06a 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -11,7 +11,7 @@ import difflib import logging import re -from typing import Any +from typing import Any, List from kedro.io.catalog_config_resolver import CatalogConfigResolver, Patterns from kedro.io.core import ( @@ -72,10 +72,12 @@ def __init__( @property def datasets(self) -> dict[str, Any]: + # TODO: remove when removing old catalog return copy.copy(self._datasets) @datasets.setter def datasets(self, value: Any) -> None: + # TODO: remove when removing old catalog raise AttributeError( "Operation not allowed. Please use KedroDataCatalog.add() instead." ) @@ -100,6 +102,34 @@ def __eq__(self, other) -> bool: # type: ignore[no-untyped-def] other.config_resolver.list_patterns(), ) + def keys(self, regex_search: str | None = None) -> List[str]: # noqa: UP006 + return self._filter_keys(regex_search) + + def values(self, regex_search: str | None = None) -> List[AbstractDataset]: # noqa: UP006 + return [self._datasets[key] for key in self._filter_keys(regex_search)] + + def items( + self, regex_search: str | None = None + ) -> List[tuple[str, AbstractDataset]]: # noqa: UP006 + return [(key, self._datasets[key]) for key in self._filter_keys(regex_search)] + + def __iter__(self) -> str: + yield from self._datasets.keys() + + def __getitem__(self, ds_name: str) -> AbstractDataset: + return self.get_dataset(ds_name) + + def __setitem__(self, key: str, value: Any) -> None: + if key in self._datasets: + self._logger.warning("Replacing dataset '%s'", key) + if isinstance(value, AbstractDataset): + self._datasets[key] = value + else: + self._datasets[key] = MemoryDataset(data=value) # type: ignore[abstract] + + def _ipython_key_completions_(self) -> list[str]: + return list(self._datasets.keys()) + @property def _logger(self) -> logging.Logger: return logging.getLogger(__name__) @@ -219,30 +249,31 @@ def _get_dataset( def add( self, ds_name: str, dataset: AbstractDataset, replace: bool = False ) -> None: + # TODO: remove when removing old catalog """Adds a new ``AbstractDataset`` object to the ``KedroDataCatalog``.""" - if ds_name in self._datasets: - if replace: - self._logger.warning("Replacing dataset '%s'", ds_name) - else: - raise DatasetAlreadyExistsError( - f"Dataset '{ds_name}' has already been registered" - ) - self._datasets[ds_name] = dataset + if ds_name in self._datasets and not replace: + raise DatasetAlreadyExistsError( + f"Dataset '{ds_name}' has already been registered" + ) + self.__setitem__(ds_name, dataset) - def list(self, regex_search: str | None = None) -> list[str]: + def list(self, regex_search: str | None = None) -> List[str]: # noqa: UP006 """ List of all dataset names registered in the catalog. This can be filtered by providing an optional regular expression which will only return matching keys. """ + # TODO: remove when removing old catalog + if regex_search == "": + self._logger.warning("The empty string will not match any datasets") + return [] + + return self.keys(regex_search) + def _filter_keys(self, regex_search: str | None) -> List[str]: # noqa: UP006 if regex_search is None: return list(self._datasets.keys()) - if not regex_search.strip(): - self._logger.warning("The empty string will not match any datasets") - return [] - try: pattern = re.compile(regex_search, flags=re.IGNORECASE) except re.error as exc: @@ -305,10 +336,13 @@ def confirm(self, name: str) -> None: raise DatasetError(f"Dataset '{name}' does not have 'confirm' method") def add_data(self, data: dict[str, Any], replace: bool = False) -> None: - # This method was simplified to add memory datasets only, since - # adding AbstractDataset can be done via add() method + # TODO: remove when removing old catalog for ds_name, ds_data in data.items(): - self.add(ds_name, MemoryDataset(data=ds_data), replace) # type: ignore[abstract] + if ds_name in self._datasets and not replace: + raise DatasetAlreadyExistsError( + f"Dataset '{ds_name}' has already been registered" + ) + self.__setitem__(ds_name, ds_data) def add_feed_dict(self, feed_dict: dict[str, Any], replace: bool = False) -> None: # TODO: remove when removing old catalog From b4ae279c0467bf633006bfe059b936e5a6b34763 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 7 Oct 2024 18:33:28 +0100 Subject: [PATCH 151/173] Removed add_data() method Signed-off-by: Elena Khaustova --- RELEASE.md | 2 +- kedro/io/kedro_data_catalog.py | 11 ++++------- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/RELEASE.md b/RELEASE.md index 61560acf87..c9a871b0e0 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -4,7 +4,7 @@ * Implemented `KedroDataCatalog` repeating `DataCatalog` functionality with a few API enhancements: * Removed `_FrozenDatasets` and access datasets as properties; * Added get dataset by name feature; - * `add_feed_dict()` was simplified and renamed to `add_data()`; + * `add_feed_dict()` was simplified to only add raw data; * Datasets' initialisation was moved out from `from_config()` method to the constructor. * Moved development requirements from `requirements.txt` to the dedicated section in `pyproject.toml` for project template. * Implemented `Protocol` abstraction for the current `DataCatalog` and adding new catalog implementations. diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index ce06e34aac..882eb19cf8 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -68,7 +68,7 @@ def __init__( self._add_from_config(ds_name, ds_config) if raw_data: - self.add_data(raw_data) + self.add_feed_dict(raw_data) @property def datasets(self) -> dict[str, Any]: @@ -304,16 +304,13 @@ def confirm(self, name: str) -> None: else: raise DatasetError(f"Dataset '{name}' does not have 'confirm' method") - def add_data(self, data: dict[str, Any], replace: bool = False) -> None: + def add_feed_dict(self, feed_dict: dict[str, Any], replace: bool = False) -> None: + # TODO: remove when removing old catalog # This method was simplified to add memory datasets only, since # adding AbstractDataset can be done via add() method - for ds_name, ds_data in data.items(): + for ds_name, ds_data in feed_dict.items(): self.add(ds_name, MemoryDataset(data=ds_data), replace) # type: ignore[abstract] - def add_feed_dict(self, feed_dict: dict[str, Any], replace: bool = False) -> None: - # TODO: remove when removing old catalog - return self.add_data(feed_dict, replace) - def shallow_copy( self, extra_dataset_patterns: Patterns | None = None ) -> KedroDataCatalog: From 5bdf16b4254be1e4db277d8955de98f7bc52f6dc Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 7 Oct 2024 18:49:08 +0100 Subject: [PATCH 152/173] Added usage example and updated docstrings with experimental feature note Signed-off-by: Elena Khaustova --- RELEASE.md | 6 ++++-- kedro/io/kedro_data_catalog.py | 13 +++++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/RELEASE.md b/RELEASE.md index c9a871b0e0..e80e7f112d 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -12,12 +12,14 @@ * Moved pattern resolution logic from `DataCatalog` to a separate component - `CatalogConfigResolver`. Updated `DataCatalog` to use `CatalogConfigResolver` internally. * Made packaged Kedro projects return `session.run()` output to be used when running it in the interactive environment. * Enhanced `OmegaConfigLoader` configuration validation to detect duplicate keys at all parameter levels, ensuring comprehensive nested key checking. + +**Note:** ``KedroDataCatalog`` is an experimental feature, so please mind possible breaking changes while using it. + ## Bug fixes and other changes * Fixed bug where using dataset factories breaks with `ThreadRunner`. * Fixed a bug where `SharedMemoryDataset.exists` would not call the underlying `MemoryDataset`. * Fixed template projects example tests. -* Made credentials loading consistent between `KedroContext._get_catalog()` and `resolve_patterns` so that both us -e `_get_config_credentials()` +* Made credentials loading consistent between `KedroContext._get_catalog()` and `resolve_patterns` so that both use `_get_config_credentials()` ## Breaking changes to the API * Removed `ShelveStore` to address a security vulnerability. diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 882eb19cf8..f9cb40ae4c 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -3,6 +3,9 @@ use a ``KedroDataCatalog``, you need to instantiate it with a dictionary of datasets. Then it will act as a single point of reference for your calls, relaying load and save functions to the underlying datasets. + +``KedroDataCatalog`` is an experimental feature aimed to replace ``DataCatalog`` in the future. +Expect possible breaking changes while using it. """ from __future__ import annotations @@ -44,6 +47,9 @@ def __init__( single point of reference for your calls, relaying load and save functions to the underlying datasets. + Note: ``KedroDataCatalog`` is an experimental feature, so please mind + possible breaking changes while using it. + Args: datasets: A dictionary of dataset names and dataset instances. raw_data: A dictionary with data to be added in memory as `MemoryDataset`` instances. @@ -56,6 +62,13 @@ def __init__( case-insensitive string that conforms with operating system filename limitations, b) always return the latest version when sorted in lexicographical order. + + Example: + :: + >>> # settings.py + >>> from kedro.io import KedroDataCatalog + >>> + >>> DATA_CATALOG_CLASS = KedroDataCatalog """ self._config_resolver = config_resolver or CatalogConfigResolver() self._datasets = datasets or {} From 8ea366748b29e693b9ce0db3b1d1e45130980947 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 8 Oct 2024 13:31:59 +0100 Subject: [PATCH 153/173] Added len and get Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 35 +++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index b2ce312d77..e1c8d1d0ce 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -140,6 +140,22 @@ def __setitem__(self, key: str, value: Any) -> None: else: self._datasets[key] = MemoryDataset(data=value) # type: ignore[abstract] + def __len__(self) -> int: + return len(self.keys()) + + def get( + self, key: str, default: AbstractDataset | None = None + ) -> AbstractDataset | None: + """Get a dataset by name from an internal collection of datasets.""" + if key not in self._datasets: + ds_config = self._config_resolver.resolve_pattern(key) + if ds_config: + self._add_from_config(key, ds_config) + + dataset = self._datasets.get(key, None) + + return dataset or default + def _ipython_key_completions_(self) -> list[str]: return list(self._datasets.keys()) @@ -209,6 +225,7 @@ def _add_from_config(self, ds_name: str, ds_config: dict[str, Any]) -> None: def get_dataset( self, ds_name: str, version: Version | None = None, suggest: bool = True ) -> AbstractDataset: + # TODO: remove when removing old catalog """Get a dataset by name from an internal collection of datasets. If a dataset is not in the collection but matches any pattern @@ -228,12 +245,7 @@ def get_dataset( DatasetNotFoundError: When a dataset with the given name is not in the collection and do not match patterns. """ - if ds_name not in self._datasets: - ds_config = self._config_resolver.resolve_pattern(ds_name) - if ds_config: - self._add_from_config(ds_name, ds_config) - - dataset = self._datasets.get(ds_name, None) + dataset = self.get(ds_name) if dataset is None: error_msg = f"Dataset '{ds_name}' not found in the catalog" @@ -271,12 +283,12 @@ def add( self.__setitem__(ds_name, dataset) def list(self, regex_search: str | None = None) -> List[str]: # noqa: UP006 + # TODO: remove when removing old catalog """ List of all dataset names registered in the catalog. This can be filtered by providing an optional regular expression which will only return matching keys. """ - # TODO: remove when removing old catalog if regex_search == "": self._logger.warning("The empty string will not match any datasets") return [] @@ -296,6 +308,11 @@ def _filter_keys(self, regex_search: str | None) -> List[str]: # noqa: UP006 return [ds_name for ds_name in self._datasets if pattern.search(ds_name)] def save(self, name: str, data: Any) -> None: + # TODO: remove when removing old catalog + """Save data to a registered dataset.""" + self.save_data(name, data) + + def save_data(self, name: str, data: Any) -> None: """Save data to a registered dataset.""" dataset = self.get_dataset(name) @@ -309,6 +326,10 @@ def save(self, name: str, data: Any) -> None: dataset.save(data) def load(self, name: str, version: str | None = None) -> Any: + # TODO: remove when removing old catalog + return self.load_data(name, version) + + def load_data(self, name: str, version: str | None = None) -> Any: """Loads a registered dataset.""" load_version = Version(version, None) if version else None dataset = self.get_dataset(name, version=load_version) From 8fd70436cdadca2e52961845145730997f83d10b Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 8 Oct 2024 14:42:37 +0100 Subject: [PATCH 154/173] Implemented unit tests Signed-off-by: Elena Khaustova --- tests/io/test_kedro_data_catalog.py | 35 ++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/tests/io/test_kedro_data_catalog.py b/tests/io/test_kedro_data_catalog.py index b98e8fae83..a52d91cd7e 100644 --- a/tests/io/test_kedro_data_catalog.py +++ b/tests/io/test_kedro_data_catalog.py @@ -397,7 +397,7 @@ def test_config_invalid_dataset_config(self, correct_config): def test_empty_config(self): """Test empty config""" - assert KedroDataCatalog.from_config(None) + assert len(KedroDataCatalog.from_config(None)) == 0 def test_missing_credentials(self, correct_config): """Check the error if credentials can't be located""" @@ -520,6 +520,39 @@ def test_bad_confirm(self, correct_config, dataset_name, pattern): with pytest.raises(DatasetError, match=re.escape(pattern)): data_catalog.confirm(dataset_name) + def test_iteration(self, correct_config): + """Test iterate through keys, values and items.""" + data_catalog = KedroDataCatalog.from_config(**correct_config) + + for ds_name_cat, ds_name_config in zip( + data_catalog, correct_config["catalog"] + ): + assert ds_name_cat == ds_name_config + + for ds_name_cat, ds_name_config in zip( + data_catalog.keys(), correct_config["catalog"] + ): + assert ds_name_cat == ds_name_config + + for ds in data_catalog.values(): + assert isinstance(ds, CSVDataset) + + for ds_name, ds in data_catalog.items(): + assert isinstance(ds, CSVDataset) + assert ds_name in correct_config["catalog"] + + def test_getitem_setitem(self, correct_config): + """Test get and set item.""" + data_catalog = KedroDataCatalog.from_config(**correct_config) + data_catalog["test"] = 123 + assert isinstance(data_catalog["test"], MemoryDataset) + + def test_ipython_key_completions(self, correct_config): + data_catalog = KedroDataCatalog.from_config(**correct_config) + assert data_catalog._ipython_key_completions_() == list( + correct_config["catalog"].keys() + ) + class TestDataCatalogVersioned: def test_from_correct_config_versioned(self, correct_config, dummy_dataframe): """Test load and save of versioned datasets from config""" From a50fbc9af5fe7302b6cc60f967195b03647de935 Mon Sep 17 00:00:00 2001 From: ElenaKhaustova <157851531+ElenaKhaustova@users.noreply.github.com> Date: Tue, 8 Oct 2024 14:46:20 +0100 Subject: [PATCH 155/173] Update RELEASE.md Co-authored-by: Merel Theisen <49397448+merelcht@users.noreply.github.com> Signed-off-by: ElenaKhaustova <157851531+ElenaKhaustova@users.noreply.github.com> --- RELEASE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RELEASE.md b/RELEASE.md index e80e7f112d..19621ea499 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -13,7 +13,7 @@ * Made packaged Kedro projects return `session.run()` output to be used when running it in the interactive environment. * Enhanced `OmegaConfigLoader` configuration validation to detect duplicate keys at all parameter levels, ensuring comprehensive nested key checking. -**Note:** ``KedroDataCatalog`` is an experimental feature, so please mind possible breaking changes while using it. +**Note:** ``KedroDataCatalog`` is an experimental feature and is under active development. Therefore, it is possible we'll introduce breaking changes to this class, so be mindful of that if you decide to use it already. Let us know if you have any feedback about the ``KedroDataCatalog`` or ideas for new features. ## Bug fixes and other changes * Fixed bug where using dataset factories breaks with `ThreadRunner`. From 5b02d05de08b80b8aa5772116c3beada2b8598be Mon Sep 17 00:00:00 2001 From: ElenaKhaustova <157851531+ElenaKhaustova@users.noreply.github.com> Date: Tue, 8 Oct 2024 14:46:36 +0100 Subject: [PATCH 156/173] Update kedro/io/kedro_data_catalog.py Co-authored-by: Merel Theisen <49397448+merelcht@users.noreply.github.com> Signed-off-by: ElenaKhaustova <157851531+ElenaKhaustova@users.noreply.github.com> --- kedro/io/kedro_data_catalog.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index f9cb40ae4c..d07de8151a 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -47,8 +47,7 @@ def __init__( single point of reference for your calls, relaying load and save functions to the underlying datasets. - Note: ``KedroDataCatalog`` is an experimental feature, so please mind - possible breaking changes while using it. + Note: ``KedroDataCatalog`` is an experimental feature and is under active development. Therefore, it is possible we'll introduce breaking changes to this class, so be mindful of that if you decide to use it already. Args: datasets: A dictionary of dataset names and dataset instances. From 1223f2679af0cdef0924ad88374ce4305ca71baf Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 8 Oct 2024 15:38:22 +0100 Subject: [PATCH 157/173] Fixed lint Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 4038e3dfa0..e7645620ba 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -14,7 +14,7 @@ import difflib import logging import re -from typing import Any, List +from typing import Any, List # noqa: UP035 from kedro.io.catalog_config_resolver import CatalogConfigResolver, Patterns from kedro.io.core import ( From 02e2c5e380c4077dab38f203a7a7118812a04c08 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 9 Oct 2024 00:05:12 +0100 Subject: [PATCH 158/173] Updated load_data and save_data to use new interface Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 35 ++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index e7645620ba..6cc7252d67 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -128,8 +128,8 @@ def items( def __iter__(self) -> str: yield from self._datasets.keys() - def __getitem__(self, ds_name: str) -> AbstractDataset: - return self.get_dataset(ds_name) + def __getitem__(self, ds_name: str) -> AbstractDataset | None: + return self.get(ds_name) def __setitem__(self, key: str, value: Any) -> None: if key in self._datasets: @@ -306,6 +306,22 @@ def _filter_keys(self, regex_search: str | None) -> List[str]: # noqa: UP006 ) from exc return [ds_name for ds_name in self._datasets if pattern.search(ds_name)] + def _validate_dataset(self, ds_name: str, ds: AbstractDataset | None) -> None: + """Validates if dataset is not None and suggests fuzzy-matching datasets' names + in the DatasetNotFoundError message otherwise. + + Raises: + DatasetNotFoundError: When a dataset with the given name + is not in the collection and do not match patterns. + """ + if ds is None: + error_msg = f"Dataset '{ds_name}' not found in the catalog" + matches = difflib.get_close_matches(ds_name, self._datasets.keys()) + if matches: + suggestions = ", ".join(matches) + error_msg += f" - did you mean one of these instead: {suggestions}" + raise DatasetNotFoundError(error_msg) + def save(self, name: str, data: Any) -> None: # TODO: remove when removing old catalog """Save data to a registered dataset.""" @@ -313,7 +329,8 @@ def save(self, name: str, data: Any) -> None: def save_data(self, name: str, data: Any) -> None: """Save data to a registered dataset.""" - dataset = self.get_dataset(name) + dataset = self.get(name) + self._validate_dataset(name, dataset) self._logger.info( "Saving data to %s (%s)...", @@ -322,7 +339,7 @@ def save_data(self, name: str, data: Any) -> None: extra={"markup": True}, ) - dataset.save(data) + dataset.save(data) # type: ignore[union-attr] def load(self, name: str, version: str | None = None) -> Any: # TODO: remove when removing old catalog @@ -330,8 +347,14 @@ def load(self, name: str, version: str | None = None) -> Any: def load_data(self, name: str, version: str | None = None) -> Any: """Loads a registered dataset.""" + dataset = self.get(name) + self._validate_dataset(name, dataset) + load_version = Version(version, None) if version else None - dataset = self.get_dataset(name, version=load_version) + if load_version and isinstance(dataset, AbstractVersionedDataset): + # we only want to return a similar-looking dataset, + # not modify the one stored in the current catalog + dataset = dataset._copy(_version=load_version) self._logger.info( "Loading data from %s (%s)...", @@ -340,7 +363,7 @@ def load_data(self, name: str, version: str | None = None) -> Any: extra={"markup": True}, ) - return dataset.load() + return dataset.load() # type: ignore[union-attr] def release(self, name: str) -> None: """Release any cached data associated with a dataset From 517d770224582951226fbe15a6d75c6a62dae8df Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 9 Oct 2024 00:05:59 +0100 Subject: [PATCH 159/173] Updated load_data and save_data to use new interface Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 6cc7252d67..3b225342b7 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -308,7 +308,7 @@ def _filter_keys(self, regex_search: str | None) -> List[str]: # noqa: UP006 def _validate_dataset(self, ds_name: str, ds: AbstractDataset | None) -> None: """Validates if dataset is not None and suggests fuzzy-matching datasets' names - in the DatasetNotFoundError message otherwise. + in the DatasetNotFoundError message otherwise. Raises: DatasetNotFoundError: When a dataset with the given name From aa95229267166a88ecde09a9c7f1b9d1b343ad14 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 9 Oct 2024 00:24:51 +0100 Subject: [PATCH 160/173] Returned usage of get_dataset() Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 33 +++++---------------------------- 1 file changed, 5 insertions(+), 28 deletions(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 3b225342b7..204327b297 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -128,7 +128,7 @@ def items( def __iter__(self) -> str: yield from self._datasets.keys() - def __getitem__(self, ds_name: str) -> AbstractDataset | None: + def __getitem__(self, ds_name: str) -> AbstractDataset: return self.get(ds_name) def __setitem__(self, key: str, value: Any) -> None: @@ -306,22 +306,6 @@ def _filter_keys(self, regex_search: str | None) -> List[str]: # noqa: UP006 ) from exc return [ds_name for ds_name in self._datasets if pattern.search(ds_name)] - def _validate_dataset(self, ds_name: str, ds: AbstractDataset | None) -> None: - """Validates if dataset is not None and suggests fuzzy-matching datasets' names - in the DatasetNotFoundError message otherwise. - - Raises: - DatasetNotFoundError: When a dataset with the given name - is not in the collection and do not match patterns. - """ - if ds is None: - error_msg = f"Dataset '{ds_name}' not found in the catalog" - matches = difflib.get_close_matches(ds_name, self._datasets.keys()) - if matches: - suggestions = ", ".join(matches) - error_msg += f" - did you mean one of these instead: {suggestions}" - raise DatasetNotFoundError(error_msg) - def save(self, name: str, data: Any) -> None: # TODO: remove when removing old catalog """Save data to a registered dataset.""" @@ -329,8 +313,7 @@ def save(self, name: str, data: Any) -> None: def save_data(self, name: str, data: Any) -> None: """Save data to a registered dataset.""" - dataset = self.get(name) - self._validate_dataset(name, dataset) + dataset = self.get_dataset(name) self._logger.info( "Saving data to %s (%s)...", @@ -339,7 +322,7 @@ def save_data(self, name: str, data: Any) -> None: extra={"markup": True}, ) - dataset.save(data) # type: ignore[union-attr] + dataset.save(data) def load(self, name: str, version: str | None = None) -> Any: # TODO: remove when removing old catalog @@ -347,14 +330,8 @@ def load(self, name: str, version: str | None = None) -> Any: def load_data(self, name: str, version: str | None = None) -> Any: """Loads a registered dataset.""" - dataset = self.get(name) - self._validate_dataset(name, dataset) - load_version = Version(version, None) if version else None - if load_version and isinstance(dataset, AbstractVersionedDataset): - # we only want to return a similar-looking dataset, - # not modify the one stored in the current catalog - dataset = dataset._copy(_version=load_version) + dataset = self.get_dataset(name, version=load_version) self._logger.info( "Loading data from %s (%s)...", @@ -363,7 +340,7 @@ def load_data(self, name: str, version: str | None = None) -> Any: extra={"markup": True}, ) - return dataset.load() # type: ignore[union-attr] + return dataset.load() def release(self, name: str) -> None: """Release any cached data associated with a dataset From 494b4b97ffdc7ec0b370ef062021ad5f7013fa33 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 9 Oct 2024 00:34:01 +0100 Subject: [PATCH 161/173] Fixed lint Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 204327b297..33bcf0d29c 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -128,7 +128,7 @@ def items( def __iter__(self) -> str: yield from self._datasets.keys() - def __getitem__(self, ds_name: str) -> AbstractDataset: + def __getitem__(self, ds_name: str) -> AbstractDataset | None: return self.get(ds_name) def __setitem__(self, key: str, value: Any) -> None: From 35e10f3269bcebe6996185500e015340e403f71d Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 14 Oct 2024 11:20:48 +0100 Subject: [PATCH 162/173] Updated __getitem__ to use old get_dataset() method Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 33bcf0d29c..01bfa117af 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -129,7 +129,7 @@ def __iter__(self) -> str: yield from self._datasets.keys() def __getitem__(self, ds_name: str) -> AbstractDataset | None: - return self.get(ds_name) + return self.get_dataset(ds_name) def __setitem__(self, key: str, value: Any) -> None: if key in self._datasets: From 9d52ecf6003ca214b95cc6b2d196c3383cd0d53d Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 15 Oct 2024 22:57:37 +0100 Subject: [PATCH 163/173] Removed regex_search from values() Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 01bfa117af..7372c0d31d 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -117,8 +117,8 @@ def __eq__(self, other) -> bool: # type: ignore[no-untyped-def] def keys(self, regex_search: str | None = None) -> List[str]: # noqa: UP006 return self._filter_keys(regex_search) - def values(self, regex_search: str | None = None) -> List[AbstractDataset]: # noqa: UP006 - return [self._datasets[key] for key in self._filter_keys(regex_search)] + def values(self) -> List[AbstractDataset]: # noqa: UP006 + return [self._datasets[key] for key in self] def items( self, regex_search: str | None = None From dac141dce453cda54d154fcb41a4d937f7f50046 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 15 Oct 2024 23:09:47 +0100 Subject: [PATCH 164/173] Fixed type annotation for __iter__ Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 7372c0d31d..b3d3958d8d 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -14,7 +14,7 @@ import difflib import logging import re -from typing import Any, List # noqa: UP035 +from typing import Any, Iterable, List # noqa: UP035 from kedro.io.catalog_config_resolver import CatalogConfigResolver, Patterns from kedro.io.core import ( @@ -125,7 +125,7 @@ def items( ) -> List[tuple[str, AbstractDataset]]: # noqa: UP006 return [(key, self._datasets[key]) for key in self._filter_keys(regex_search)] - def __iter__(self) -> str: + def __iter__(self) -> Iterable[str]: yield from self._datasets.keys() def __getitem__(self, ds_name: str) -> AbstractDataset | None: From 6b3eb9e98544a7a7d5f2949c273bd9b5af4e40ba Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 15 Oct 2024 23:18:28 +0100 Subject: [PATCH 165/173] Fixed linter Signed-off-by: Elena Khaustova --- kedro/framework/cli/utils.py | 2 +- kedro/framework/context/context.py | 2 +- kedro/io/kedro_data_catalog.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kedro/framework/cli/utils.py b/kedro/framework/cli/utils.py index ca2acfab31..1b50408cc5 100644 --- a/kedro/framework/cli/utils.py +++ b/kedro/framework/cli/utils.py @@ -422,7 +422,7 @@ def find_run_command(package_name: str) -> Callable: # use run command from `kedro.framework.cli.project` from kedro.framework.cli.project import run - return run # type: ignore[return-value] + return run # type: ignore[no-any-return] # fail badly if cli.py exists, but has no `cli` in it if not hasattr(project_cli, "cli"): raise KedroCliError(f"Cannot load commands from {package_name}.cli") diff --git a/kedro/framework/context/context.py b/kedro/framework/context/context.py index 0b44056374..5c14cbae38 100644 --- a/kedro/framework/context/context.py +++ b/kedro/framework/context/context.py @@ -207,7 +207,7 @@ def params(self) -> dict[str, Any]: # Merge nested structures params = OmegaConf.merge(params, self._extra_params) - return OmegaConf.to_container(params) if OmegaConf.is_config(params) else params # type: ignore[return-value] + return OmegaConf.to_container(params) if OmegaConf.is_config(params) else params # type: ignore[no-any-return] def _get_catalog( self, diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index b3d3958d8d..3aac00f49a 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -118,7 +118,7 @@ def keys(self, regex_search: str | None = None) -> List[str]: # noqa: UP006 return self._filter_keys(regex_search) def values(self) -> List[AbstractDataset]: # noqa: UP006 - return [self._datasets[key] for key in self] + return [self._datasets[key] for key in self.__iter__()] def items( self, regex_search: str | None = None From 3edb4fb5b74c1c7717099f8981fe000eb108dacb Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 15 Oct 2024 23:23:10 +0100 Subject: [PATCH 166/173] Revert lint fix Signed-off-by: Elena Khaustova --- kedro/framework/cli/utils.py | 2 +- kedro/framework/context/context.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kedro/framework/cli/utils.py b/kedro/framework/cli/utils.py index 1b50408cc5..ca2acfab31 100644 --- a/kedro/framework/cli/utils.py +++ b/kedro/framework/cli/utils.py @@ -422,7 +422,7 @@ def find_run_command(package_name: str) -> Callable: # use run command from `kedro.framework.cli.project` from kedro.framework.cli.project import run - return run # type: ignore[no-any-return] + return run # type: ignore[return-value] # fail badly if cli.py exists, but has no `cli` in it if not hasattr(project_cli, "cli"): raise KedroCliError(f"Cannot load commands from {package_name}.cli") diff --git a/kedro/framework/context/context.py b/kedro/framework/context/context.py index 5c14cbae38..0b44056374 100644 --- a/kedro/framework/context/context.py +++ b/kedro/framework/context/context.py @@ -207,7 +207,7 @@ def params(self) -> dict[str, Any]: # Merge nested structures params = OmegaConf.merge(params, self._extra_params) - return OmegaConf.to_container(params) if OmegaConf.is_config(params) else params # type: ignore[no-any-return] + return OmegaConf.to_container(params) if OmegaConf.is_config(params) else params # type: ignore[return-value] def _get_catalog( self, From 78af7ebb9cd92c32e43ac895b9e9e402408ce6bb Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 16 Oct 2024 15:07:00 +0100 Subject: [PATCH 167/173] Returned short names for save and load Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 65 +++++++++++++++++++++++++++++----- 1 file changed, 56 insertions(+), 9 deletions(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 3aac00f49a..c17b1fe791 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -307,12 +307,35 @@ def _filter_keys(self, regex_search: str | None) -> List[str]: # noqa: UP006 return [ds_name for ds_name in self._datasets if pattern.search(ds_name)] def save(self, name: str, data: Any) -> None: - # TODO: remove when removing old catalog - """Save data to a registered dataset.""" - self.save_data(name, data) + # TODO: rename input argument when breaking change: name -> ds_name + """Save data to a registered dataset. + + Args: + name: A dataset to be saved to. + data: A data object to be saved as configured in the registered + dataset. + + Raises: + DatasetNotFoundError: When a dataset with the given name + has not yet been registered. - def save_data(self, name: str, data: Any) -> None: - """Save data to a registered dataset.""" + Example: + :: + + >>> import pandas as pd + >>> + >>> from kedro_datasets.pandas import CSVDataset + >>> + >>> cars = CSVDataset(filepath="cars.csv", + >>> load_args=None, + >>> save_args={"index": False}) + >>> catalog = DataCatalog(datasets={'cars': cars}) + >>> + >>> df = pd.DataFrame({'col1': [1, 2], + >>> 'col2': [4, 5], + >>> 'col3': [5, 6]}) + >>> catalog.save("cars", df) + """ dataset = self.get_dataset(name) self._logger.info( @@ -325,11 +348,35 @@ def save_data(self, name: str, data: Any) -> None: dataset.save(data) def load(self, name: str, version: str | None = None) -> Any: - # TODO: remove when removing old catalog - return self.load_data(name, version) + # TODO: rename input argument when breaking change: name -> ds_name + # TODO: remove version from input arguments when breaking change + """Loads a registered dataset. - def load_data(self, name: str, version: str | None = None) -> Any: - """Loads a registered dataset.""" + Args: + name: A dataset to be loaded. + version: Optional argument for concrete data version to be loaded. + Works only with versioned datasets. + + Returns: + The loaded data as configured. + + Raises: + DatasetNotFoundError: When a dataset with the given name + has not yet been registered. + + Example: + :: + + >>> from kedro.io import DataCatalog + >>> from kedro_datasets.pandas import CSVDataset + >>> + >>> cars = CSVDataset(filepath="cars.csv", + >>> load_args=None, + >>> save_args={"index": False}) + >>> catalog = DataCatalog(datasets={'cars': cars}) + >>> + >>> df = catalog.load("cars") + """ load_version = Version(version, None) if version else None dataset = self.get_dataset(name, version=load_version) From 435bea16bb88bfbf025eb4377314d408a66466c2 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 16 Oct 2024 19:00:48 +0100 Subject: [PATCH 168/173] Removed regex_search from keys and items Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index c17b1fe791..0568184f0e 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -114,21 +114,19 @@ def __eq__(self, other) -> bool: # type: ignore[no-untyped-def] other.config_resolver.list_patterns(), ) - def keys(self, regex_search: str | None = None) -> List[str]: # noqa: UP006 - return self._filter_keys(regex_search) + def keys(self) -> List[str]: # noqa: UP006 + return list(self.__iter__()) def values(self) -> List[AbstractDataset]: # noqa: UP006 return [self._datasets[key] for key in self.__iter__()] - def items( - self, regex_search: str | None = None - ) -> List[tuple[str, AbstractDataset]]: # noqa: UP006 - return [(key, self._datasets[key]) for key in self._filter_keys(regex_search)] + def items(self) -> List[tuple[str, AbstractDataset]]: # noqa: UP006 + return [(key, self._datasets[key]) for key in self.__iter__()] def __iter__(self) -> Iterable[str]: yield from self._datasets.keys() - def __getitem__(self, ds_name: str) -> AbstractDataset | None: + def __getitem__(self, ds_name: str) -> AbstractDataset: return self.get_dataset(ds_name) def __setitem__(self, key: str, value: Any) -> None: @@ -137,6 +135,7 @@ def __setitem__(self, key: str, value: Any) -> None: if isinstance(value, AbstractDataset): self._datasets[key] = value else: + self._logger.info(f"Adding input data as a MemoryDataset - {key}") self._datasets[key] = MemoryDataset(data=value) # type: ignore[abstract] def __len__(self) -> int: @@ -292,19 +291,26 @@ def list(self, regex_search: str | None = None) -> List[str]: # noqa: UP006 self._logger.warning("The empty string will not match any datasets") return [] - return self.keys(regex_search) + return self.filter(regex_search) - def _filter_keys(self, regex_search: str | None) -> List[str]: # noqa: UP006 + def filter( + self, regex_search: str | None, regex_flags: int | re.RegexFlag | None = None + ) -> List[str]: # noqa: UP006 + """ + Filter dataset names registered in the catalog. + """ if regex_search is None: - return list(self._datasets.keys()) + return self.keys() + if regex_flags is None: + regex_flags = re.IGNORECASE try: - pattern = re.compile(regex_search, flags=re.IGNORECASE) + pattern = re.compile(regex_search, flags=regex_flags) except re.error as exc: raise SyntaxError( f"Invalid regular expression provided: '{regex_search}'" ) from exc - return [ds_name for ds_name in self._datasets if pattern.search(ds_name)] + return [ds_name for ds_name in self.__iter__() if pattern.search(ds_name)] def save(self, name: str, data: Any) -> None: # TODO: rename input argument when breaking change: name -> ds_name From 08fa0196b3f463b640c1680a5c7902305d9f127f Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 16 Oct 2024 19:04:15 +0100 Subject: [PATCH 169/173] Updated release notes Signed-off-by: Elena Khaustova --- RELEASE.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/RELEASE.md b/RELEASE.md index a5e34a6ba8..f4b10035b7 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,6 +1,10 @@ # Upcoming Release ## Major features and improvements +* Implemented dict-like interface for `KedroDataCatalog`. + +**Note:** ``KedroDataCatalog`` is an experimental feature and is under active development. Therefore, it is possible we'll introduce breaking changes to this class, so be mindful of that if you decide to use it already. Let us know if you have any feedback about the ``KedroDataCatalog`` or ideas for new features. + ## Bug fixes and other changes ## Breaking changes to the API ## Documentation changes From 6de5fbffacd605e2abb77768435d2935363df41e Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 16 Oct 2024 19:17:59 +0100 Subject: [PATCH 170/173] Maded regex_search non optional Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 0568184f0e..769ce39e98 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -287,6 +287,9 @@ def list(self, regex_search: str | None = None) -> List[str]: # noqa: UP006 This can be filtered by providing an optional regular expression which will only return matching keys. """ + if regex_search is None: + return self.keys() + if regex_search == "": self._logger.warning("The empty string will not match any datasets") return [] @@ -294,13 +297,11 @@ def list(self, regex_search: str | None = None) -> List[str]: # noqa: UP006 return self.filter(regex_search) def filter( - self, regex_search: str | None, regex_flags: int | re.RegexFlag | None = None + self, regex_search: str, regex_flags: int | re.RegexFlag | None = None ) -> List[str]: # noqa: UP006 """ Filter dataset names registered in the catalog. """ - if regex_search is None: - return self.keys() if regex_flags is None: regex_flags = re.IGNORECASE From 23f35246bf16ba86625d2f2abbec8772c4d0b4bc Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 16 Oct 2024 19:20:18 +0100 Subject: [PATCH 171/173] Changed default for regex_flags Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 769ce39e98..7acb8545f2 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -297,12 +297,12 @@ def list(self, regex_search: str | None = None) -> List[str]: # noqa: UP006 return self.filter(regex_search) def filter( - self, regex_search: str, regex_flags: int | re.RegexFlag | None = None + self, regex_search: str, regex_flags: int | re.RegexFlag = 0 ) -> List[str]: # noqa: UP006 """ Filter dataset names registered in the catalog. """ - if regex_flags is None: + if not regex_flags: regex_flags = re.IGNORECASE try: From c93aabb0c81eee93a9d5230bc316bc18bc860ce6 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 18 Oct 2024 15:59:26 +0100 Subject: [PATCH 172/173] Returned list() method Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 7acb8545f2..87dc4c0fa0 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -280,8 +280,10 @@ def add( ) self.__setitem__(ds_name, dataset) - def list(self, regex_search: str | None = None) -> List[str]: # noqa: UP006 - # TODO: remove when removing old catalog + def list( + self, regex_search: str | None = None, regex_flags: int | re.RegexFlag = 0 + ) -> List[str]: # noqa: UP006 + # TODO: rename depending on the solution for https://github.com/kedro-org/kedro/issues/3917 """ List of all dataset names registered in the catalog. This can be filtered by providing an optional regular expression @@ -294,14 +296,6 @@ def list(self, regex_search: str | None = None) -> List[str]: # noqa: UP006 self._logger.warning("The empty string will not match any datasets") return [] - return self.filter(regex_search) - - def filter( - self, regex_search: str, regex_flags: int | re.RegexFlag = 0 - ) -> List[str]: # noqa: UP006 - """ - Filter dataset names registered in the catalog. - """ if not regex_flags: regex_flags = re.IGNORECASE From 6650a83038e90cae5da7f78f0a84eccadd8a2ddb Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 18 Oct 2024 16:47:08 +0100 Subject: [PATCH 173/173] Fixed __iter__ return type Signed-off-by: Elena Khaustova --- kedro/io/kedro_data_catalog.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index 87dc4c0fa0..c3d216abcd 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -14,7 +14,7 @@ import difflib import logging import re -from typing import Any, Iterable, List # noqa: UP035 +from typing import Any, Iterator, List # noqa: UP035 from kedro.io.catalog_config_resolver import CatalogConfigResolver, Patterns from kedro.io.core import ( @@ -118,12 +118,12 @@ def keys(self) -> List[str]: # noqa: UP006 return list(self.__iter__()) def values(self) -> List[AbstractDataset]: # noqa: UP006 - return [self._datasets[key] for key in self.__iter__()] + return [self._datasets[key] for key in self] def items(self) -> List[tuple[str, AbstractDataset]]: # noqa: UP006 - return [(key, self._datasets[key]) for key in self.__iter__()] + return [(key, self._datasets[key]) for key in self] - def __iter__(self) -> Iterable[str]: + def __iter__(self) -> Iterator[str]: yield from self._datasets.keys() def __getitem__(self, ds_name: str) -> AbstractDataset: