From 3fe61a0f9ede30849c2935b2a42231db1674822b Mon Sep 17 00:00:00 2001 From: ElenaKhaustova <157851531+ElenaKhaustova@users.noreply.github.com> Date: Fri, 18 Oct 2024 18:11:51 +0100 Subject: [PATCH] [DataCatalog2.0]: `KedroDataCatalog` with dict interface (#4218) * Added a skeleton for AbstractDataCatalog and KedroDataCatalog Signed-off-by: Elena Khaustova * Removed from_config method Signed-off-by: Elena Khaustova * Implemented _init_datasets method Signed-off-by: Elena Khaustova * Implemented get dataset Signed-off-by: Elena Khaustova * Started resolve_patterns implementation Signed-off-by: Elena Khaustova * Implemented resolve_patterns Signed-off-by: Elena Khaustova * Fixed credentials resolving Signed-off-by: Elena Khaustova * Updated match pattern Signed-off-by: Elena Khaustova * Implemented add from dict method Signed-off-by: Elena Khaustova * Updated io __init__ Signed-off-by: Elena Khaustova * Added list method Signed-off-by: Elena Khaustova * Implemented _validate_missing_keys Signed-off-by: Elena Khaustova * Added datasets access logic Signed-off-by: Elena Khaustova * Added __contains__ and comments on lazy loading Signed-off-by: Elena Khaustova * Renamed dataset_name to ds_name Signed-off-by: Elena Khaustova * Updated some docstrings Signed-off-by: Elena Khaustova * Fixed _update_ds_configs Signed-off-by: Elena Khaustova * Fixed _init_datasets Signed-off-by: Elena Khaustova * Implemented add_runtime_patterns Signed-off-by: Elena Khaustova * Fixed runtime patterns usage Signed-off-by: Elena Khaustova * Moved pattern logic out of data catalog, implemented KedroDataCatalog Signed-off-by: Elena Khaustova * KedroDataCatalog updates Signed-off-by: Elena Khaustova * Added property to return config Signed-off-by: Elena Khaustova * Added list patterns method Signed-off-by: Elena Khaustova * Renamed and moved ConfigResolver Signed-off-by: Elena Khaustova * Renamed ConfigResolver Signed-off-by: Elena Khaustova * Cleaned KedroDataCatalog Signed-off-by: Elena 
Khaustova * Cleaned up DataCatalogConfigResolver Signed-off-by: Elena Khaustova * Docs build fix attempt Signed-off-by: Elena Khaustova * KedroDataCatalog draft Signed-off-by: Elena Khaustova * Removed KedroDataCatalog Signed-off-by: Elena Khaustova * Updated from_config method Signed-off-by: Elena Khaustova * Updated constructor and add methods Signed-off-by: Elena Khaustova * Updated _get_dataset method Signed-off-by: Elena Khaustova * Updated __contains__ Signed-off-by: Elena Khaustova * Updated __eq__ and shallow_copy Signed-off-by: Elena Khaustova * Added __iter__ and __getitem__ Signed-off-by: Elena Khaustova * Removed unused imports Signed-off-by: Elena Khaustova * Added TODO Signed-off-by: Elena Khaustova * Updated runner.run() Signed-off-by: Elena Khaustova * Updated session Signed-off-by: Elena Khaustova * Added config_resolver property Signed-off-by: Elena Khaustova * Updated catalog list command Signed-off-by: Elena Khaustova * Updated catalog create command Signed-off-by: Elena Khaustova * Updated catalog rank command Signed-off-by: Elena Khaustova * Updated catalog resolve command Signed-off-by: Elena Khaustova * Remove some methods Signed-off-by: Elena Khaustova * Removed ds configs from catalog Signed-off-by: Elena Khaustova * Fixed lint Signed-off-by: Elena Khaustova * Fixed typo Signed-off-by: Elena Khaustova * Added module docstring Signed-off-by: Elena Khaustova * Renaming methods Signed-off-by: Elena Khaustova * Removed None from Pattern type Signed-off-by: Elena Khaustova * Fixed docs failing to find class reference Signed-off-by: Elena Khaustova * Fixed docs failing to find class reference Signed-off-by: Elena Khaustova * Updated Patterns type Signed-off-by: Elena Khaustova * Fix tests (#4149) * Fix most tests Signed-off-by: Ankita Katiyar * Fix most tests Signed-off-by: Ankita Katiyar --------- Signed-off-by: Ankita Katiyar * Returned constants to avoid breaking changes Signed-off-by: Elena Khaustova * Updated KedroDataCatalog for recent 
changes Signed-off-by: Elena Khaustova * Minor fix Signed-off-by: Elena Khaustova * Updated test_sorting_order_with_other_dataset_through_extra_pattern Signed-off-by: Elena Khaustova * Removed odd properties Signed-off-by: Elena Khaustova * Updated tests Signed-off-by: Elena Khaustova * Removed None from _fetch_credentials input Signed-off-by: Elena Khaustova * Updated specs and context Signed-off-by: Elena Khaustova * Updated runners Signed-off-by: Elena Khaustova * Updated default catalog validation Signed-off-by: Elena Khaustova * Updated default catalog validation Signed-off-by: Elena Khaustova * Updated contains and added exists methods for KedroDataCatalog Signed-off-by: Elena Khaustova * Fixed docs Signed-off-by: Elena Khaustova * Fixing docs and lint Signed-off-by: Elena Khaustova * Fixed docs Signed-off-by: Elena Khaustova * Fixed docs Signed-off-by: Elena Khaustova * Fixed unit tests Signed-off-by: Elena Khaustova * Added __eq__ Signed-off-by: Elena Khaustova * Renamed DataCatalogConfigResolver to CatalogConfigResolver Signed-off-by: Elena Khaustova * Renamed _init_configs to _resolve_config_credentials Signed-off-by: Elena Khaustova * Moved functions to the class Signed-off-by: Elena Khaustova * Refactored resolve_dataset_pattern Signed-off-by: Elena Khaustova * Fixed refactored part Signed-off-by: Elena Khaustova * Changed the order of arguments for DataCatalog constructor Signed-off-by: Elena Khaustova * Replaced __getitem__ with .get() Signed-off-by: Elena Khaustova * Updated catalog commands Signed-off-by: Elena Khaustova * Moved warm up block outside of the try block Signed-off-by: Elena Khaustova * Fixed linter Signed-off-by: Elena Khaustova * Removed odd copying Signed-off-by: Elena Khaustova * Renamed DataCatalogConfigResolver to CatalogConfigResolver Signed-off-by: Elena Khaustova * Renamed AbstractDataCatalog to BaseDataCatalog Signed-off-by: Elena Khaustova * Moved validate_dataset_config inside catalog Signed-off-by: Elena Khaustova * Renamed 
_init_dataset to _add_from_config Signed-off-by: Elena Khaustova * Fix lint Signed-off-by: Elena Khaustova * Updated release notes Signed-off-by: Elena Khaustova * Returned DatasetError Signed-off-by: Elena Khaustova * Added _dataset_patterns and _default_pattern to _config_resolver to avoid breaking change Signed-off-by: Elena Khaustova * Made resolve_dataset_pattern return just dict Signed-off-by: Elena Khaustova * Fixed linter Signed-off-by: Elena Khaustova * Added CatalogProtocol draft Signed-off-by: Elena Khaustova * Implemented CatalogProtocol Signed-off-by: Elena Khaustova * Updated types Signed-off-by: Elena Khaustova * Fixed linter Signed-off-by: Elena Khaustova * Added _ImplementsCatalogProtocolValidator Signed-off-by: Elena Khaustova * Updated docstrings Signed-off-by: Elena Khaustova * Fixed tests Signed-off-by: Elena Khaustova * Fixed docs Signed-off-by: Elena Khaustova * Excluded Protocol from coverage Signed-off-by: Elena Khaustova * Fixed docs Signed-off-by: Elena Khaustova * Renamed catalog source to kedro_data_catalog Signed-off-by: Elena Khaustova * Renamed data set to dataset in docstrings Signed-off-by: Elena Khaustova * Updated add_from_dict Signed-off-by: Elena Khaustova * Revised comments and TODOs Signed-off-by: Elena Khaustova * Updated error message to point to specific catalog type Signed-off-by: Elena Khaustova * Fixed tests Signed-off-by: Elena Khaustova * Merged with protocol Signed-off-by: Elena Khaustova * Removed reference to DataCatalog in docstrings Signed-off-by: Elena Khaustova * Fixed docs Signed-off-by: Elena Khaustova * Reordered methods Signed-off-by: Elena Khaustova * Removed add_all from protocol Signed-off-by: Elena Khaustova * Changed the order of arguments Signed-off-by: Elena Khaustova * Updated docstrings Signed-off-by: Elena Khaustova * Updated docstrings Signed-off-by: Elena Khaustova * Added __repr__ Signed-off-by: Elena Khaustova * Made __getitem__ return deepcopy Signed-off-by: Elena Khaustova * Fixed bug in 
get_dataset() Signed-off-by: Elena Khaustova * Fixed __eq__ Signed-off-by: Elena Khaustova * Fixed docstrings Signed-off-by: Elena Khaustova * Added __setitem__ Signed-off-by: Elena Khaustova * Unit tests for `KedroDataCatalog` (#4171) * Added KedroDataCatalog tests template Signed-off-by: Elena Khaustova * Added test save/load unregistered dataset Signed-off-by: Elena Khaustova * Added test_feed_dict Signed-off-by: Elena Khaustova * Added exists tests Signed-off-by: Elena Khaustova * Added tests for list() Signed-off-by: Elena Khaustova * Added test_eq Signed-off-by: Elena Khaustova * Added test init/add datasets Signed-off-by: Elena Khaustova * Updated test_adding_datasets_not_allowed Signed-off-by: Elena Khaustova * Added shallow copy tests Signed-off-by: Elena Khaustova * Added TestKedroDataCatalogFromConfig Signed-off-by: Elena Khaustova * Added missing tests Signed-off-by: Elena Khaustova --------- Signed-off-by: Elena Khaustova * Updated RELEASE.md Signed-off-by: Elena Khaustova * Removed deep copies Signed-off-by: Elena Khaustova * Removed some interface that will be changed in the next version Signed-off-by: Elena Khaustova * Removed key completions Signed-off-by: Elena Khaustova * Fixing typos Signed-off-by: Elena Khaustova * Removed key completions test Signed-off-by: Elena Khaustova * Replaced data set with dataset Signed-off-by: Elena Khaustova * Added docstring for get_dataset() method Signed-off-by: Elena Khaustova * Renamed pytest fixture Signed-off-by: Elena Khaustova * Addressed review comments Signed-off-by: Elena Khaustova * Updated _assert_requirements_ok starters test Signed-off-by: Elena Khaustova * Revert "Updated _assert_requirements_ok starters test" This reverts commit 5208321a440432a9f11ec990c724b6ecf2fd7990. 
Signed-off-by: Elena Khaustova * Updated error message Signed-off-by: Elena Khaustova * Replaced typo Signed-off-by: Elena Khaustova * Replaced data set with dataset in docstrings Signed-off-by: Elena Khaustova * Updated tests Signed-off-by: Elena Khaustova * Made KedroDataCatalog subclass from CatalogProtocol Signed-off-by: Elena Khaustova * Updated release notes Signed-off-by: Elena Khaustova * Implemented iter, getitem, setitem Signed-off-by: Elena Khaustova * Updated add_data and TODOs Signed-off-by: Elena Khaustova * Added key completions Signed-off-by: Elena Khaustova * Made behavior dict-like Signed-off-by: Elena Khaustova * Merged with main Signed-off-by: Elena Khaustova * Removed add_data() method Signed-off-by: Elena Khaustova * Added usage example and updated docstrings with experimental feature note Signed-off-by: Elena Khaustova * Added len and get Signed-off-by: Elena Khaustova * Implemented unit tests Signed-off-by: Elena Khaustova * Update RELEASE.md Co-authored-by: Merel Theisen <49397448+merelcht@users.noreply.github.com> Signed-off-by: ElenaKhaustova <157851531+ElenaKhaustova@users.noreply.github.com> * Update kedro/io/kedro_data_catalog.py Co-authored-by: Merel Theisen <49397448+merelcht@users.noreply.github.com> Signed-off-by: ElenaKhaustova <157851531+ElenaKhaustova@users.noreply.github.com> * Fixed lint Signed-off-by: Elena Khaustova * Updated load_data and save_data to use new interface Signed-off-by: Elena Khaustova * Updated load_data and save_data to use new interface Signed-off-by: Elena Khaustova * Returned usage of get_dataset() Signed-off-by: Elena Khaustova * Fixed lint Signed-off-by: Elena Khaustova * Updated __getitem__ to use old get_dataset() method Signed-off-by: Elena Khaustova * Removed regex_search from values() Signed-off-by: Elena Khaustova * Fixed type annotation for __iter__ Signed-off-by: Elena Khaustova * Fixed linter Signed-off-by: Elena Khaustova * Revert lint fix Signed-off-by: Elena Khaustova * Returned short names 
for save and load Signed-off-by: Elena Khaustova * Removed regex_search from keys and items Signed-off-by: Elena Khaustova * Updated release notes Signed-off-by: Elena Khaustova * Made regex_search non-optional Signed-off-by: Elena Khaustova * Changed default for regex_flags Signed-off-by: Elena Khaustova * Returned list() method Signed-off-by: Elena Khaustova * Fixed __iter__ return type Signed-off-by: Elena Khaustova --------- Signed-off-by: Elena Khaustova Signed-off-by: Ankita Katiyar Signed-off-by: ElenaKhaustova <157851531+ElenaKhaustova@users.noreply.github.com> Co-authored-by: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com> Co-authored-by: Merel Theisen <49397448+merelcht@users.noreply.github.com> --- RELEASE.md | 4 + kedro/io/kedro_data_catalog.py | 148 +++++++++++++++++++++++----- tests/io/test_kedro_data_catalog.py | 35 ++++++- 3 files changed, 162 insertions(+), 25 deletions(-) diff --git a/RELEASE.md b/RELEASE.md index a5e34a6ba8..f4b10035b7 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,6 +1,10 @@ # Upcoming Release ## Major features and improvements +* Implemented dict-like interface for `KedroDataCatalog`. + +**Note:** ``KedroDataCatalog`` is an experimental feature and is under active development. Therefore, it is possible we'll introduce breaking changes to this class, so be mindful of that if you decide to use it already. Let us know if you have any feedback about the ``KedroDataCatalog`` or ideas for new features. 
+ ## Bug fixes and other changes ## Breaking changes to the API ## Documentation changes diff --git a/kedro/io/kedro_data_catalog.py b/kedro/io/kedro_data_catalog.py index d07de8151a..c3d216abcd 100644 --- a/kedro/io/kedro_data_catalog.py +++ b/kedro/io/kedro_data_catalog.py @@ -14,7 +14,7 @@ import difflib import logging import re -from typing import Any +from typing import Any, Iterator, List # noqa: UP035 from kedro.io.catalog_config_resolver import CatalogConfigResolver, Patterns from kedro.io.core import ( @@ -84,10 +84,12 @@ def __init__( @property def datasets(self) -> dict[str, Any]: + # TODO: remove when removing old catalog return copy.copy(self._datasets) @datasets.setter def datasets(self, value: Any) -> None: + # TODO: remove when removing old catalog raise AttributeError( "Operation not allowed. Please use KedroDataCatalog.add() instead." ) @@ -112,6 +114,49 @@ def __eq__(self, other) -> bool: # type: ignore[no-untyped-def] other.config_resolver.list_patterns(), ) + def keys(self) -> List[str]: # noqa: UP006 + return list(self.__iter__()) + + def values(self) -> List[AbstractDataset]: # noqa: UP006 + return [self._datasets[key] for key in self] + + def items(self) -> List[tuple[str, AbstractDataset]]: # noqa: UP006 + return [(key, self._datasets[key]) for key in self] + + def __iter__(self) -> Iterator[str]: + yield from self._datasets.keys() + + def __getitem__(self, ds_name: str) -> AbstractDataset: + return self.get_dataset(ds_name) + + def __setitem__(self, key: str, value: Any) -> None: + if key in self._datasets: + self._logger.warning("Replacing dataset '%s'", key) + if isinstance(value, AbstractDataset): + self._datasets[key] = value + else: + self._logger.info(f"Adding input data as a MemoryDataset - {key}") + self._datasets[key] = MemoryDataset(data=value) # type: ignore[abstract] + + def __len__(self) -> int: + return len(self.keys()) + + def get( + self, key: str, default: AbstractDataset | None = None + ) -> AbstractDataset | None: + 
"""Get a dataset by name from an internal collection of datasets.""" + if key not in self._datasets: + ds_config = self._config_resolver.resolve_pattern(key) + if ds_config: + self._add_from_config(key, ds_config) + + dataset = self._datasets.get(key, None) + + return dataset or default + + def _ipython_key_completions_(self) -> list[str]: + return list(self._datasets.keys()) + @property def _logger(self) -> logging.Logger: return logging.getLogger(__name__) @@ -178,6 +223,7 @@ def _add_from_config(self, ds_name: str, ds_config: dict[str, Any]) -> None: def get_dataset( self, ds_name: str, version: Version | None = None, suggest: bool = True ) -> AbstractDataset: + # TODO: remove when removing old catalog """Get a dataset by name from an internal collection of datasets. If a dataset is not in the collection but matches any pattern @@ -197,12 +243,7 @@ def get_dataset( DatasetNotFoundError: When a dataset with the given name is not in the collection and do not match patterns. """ - if ds_name not in self._datasets: - ds_config = self._config_resolver.resolve_pattern(ds_name) - if ds_config: - self._add_from_config(ds_name, ds_config) - - dataset = self._datasets.get(ds_name, None) + dataset = self.get(ds_name) if dataset is None: error_msg = f"Dataset '{ds_name}' not found in the catalog" @@ -231,40 +272,71 @@ def _get_dataset( def add( self, ds_name: str, dataset: AbstractDataset, replace: bool = False ) -> None: + # TODO: remove when removing old catalog """Adds a new ``AbstractDataset`` object to the ``KedroDataCatalog``.""" - if ds_name in self._datasets: - if replace: - self._logger.warning("Replacing dataset '%s'", ds_name) - else: - raise DatasetAlreadyExistsError( - f"Dataset '{ds_name}' has already been registered" - ) - self._datasets[ds_name] = dataset - - def list(self, regex_search: str | None = None) -> list[str]: + if ds_name in self._datasets and not replace: + raise DatasetAlreadyExistsError( + f"Dataset '{ds_name}' has already been registered" + ) 
+ self.__setitem__(ds_name, dataset) + + def list( + self, regex_search: str | None = None, regex_flags: int | re.RegexFlag = 0 + ) -> List[str]: # noqa: UP006 + # TODO: rename depending on the solution for https://github.com/kedro-org/kedro/issues/3917 """ List of all dataset names registered in the catalog. This can be filtered by providing an optional regular expression which will only return matching keys. """ - if regex_search is None: - return list(self._datasets.keys()) + return self.keys() - if not regex_search.strip(): + if regex_search == "": self._logger.warning("The empty string will not match any datasets") return [] + if not regex_flags: + regex_flags = re.IGNORECASE + try: - pattern = re.compile(regex_search, flags=re.IGNORECASE) + pattern = re.compile(regex_search, flags=regex_flags) except re.error as exc: raise SyntaxError( f"Invalid regular expression provided: '{regex_search}'" ) from exc - return [ds_name for ds_name in self._datasets if pattern.search(ds_name)] + return [ds_name for ds_name in self.__iter__() if pattern.search(ds_name)] def save(self, name: str, data: Any) -> None: - """Save data to a registered dataset.""" + # TODO: rename input argument when breaking change: name -> ds_name + """Save data to a registered dataset. + + Args: + name: A dataset to be saved to. + data: A data object to be saved as configured in the registered + dataset. + + Raises: + DatasetNotFoundError: When a dataset with the given name + has not yet been registered. 
+ + Example: + :: + + >>> import pandas as pd + >>> + >>> from kedro_datasets.pandas import CSVDataset + >>> + >>> cars = CSVDataset(filepath="cars.csv", + >>> load_args=None, + >>> save_args={"index": False}) + >>> catalog = DataCatalog(datasets={'cars': cars}) + >>> + >>> df = pd.DataFrame({'col1': [1, 2], + >>> 'col2': [4, 5], + >>> 'col3': [5, 6]}) + >>> catalog.save("cars", df) + """ dataset = self.get_dataset(name) self._logger.info( @@ -277,7 +349,35 @@ def save(self, name: str, data: Any) -> None: dataset.save(data) def load(self, name: str, version: str | None = None) -> Any: - """Loads a registered dataset.""" + # TODO: rename input argument when breaking change: name -> ds_name + # TODO: remove version from input arguments when breaking change + """Loads a registered dataset. + + Args: + name: A dataset to be loaded. + version: Optional argument for concrete data version to be loaded. + Works only with versioned datasets. + + Returns: + The loaded data as configured. + + Raises: + DatasetNotFoundError: When a dataset with the given name + has not yet been registered. 
+ + Example: + :: + + >>> from kedro.io import DataCatalog + >>> from kedro_datasets.pandas import CSVDataset + >>> + >>> cars = CSVDataset(filepath="cars.csv", + >>> load_args=None, + >>> save_args={"index": False}) + >>> catalog = DataCatalog(datasets={'cars': cars}) + >>> + >>> df = catalog.load("cars") + """ load_version = Version(version, None) if version else None dataset = self.get_dataset(name, version=load_version) diff --git a/tests/io/test_kedro_data_catalog.py b/tests/io/test_kedro_data_catalog.py index efa993bb0e..a53717f8ba 100644 --- a/tests/io/test_kedro_data_catalog.py +++ b/tests/io/test_kedro_data_catalog.py @@ -379,7 +379,7 @@ def test_config_invalid_dataset_config(self, correct_config): def test_empty_config(self): """Test empty config""" - assert KedroDataCatalog.from_config(None) + assert len(KedroDataCatalog.from_config(None)) == 0 def test_missing_credentials(self, correct_config): """Check the error if credentials can't be located""" @@ -502,6 +502,39 @@ def test_bad_confirm(self, correct_config, dataset_name, pattern): with pytest.raises(DatasetError, match=re.escape(pattern)): data_catalog.confirm(dataset_name) + def test_iteration(self, correct_config): + """Test iterate through keys, values and items.""" + data_catalog = KedroDataCatalog.from_config(**correct_config) + + for ds_name_cat, ds_name_config in zip( + data_catalog, correct_config["catalog"] + ): + assert ds_name_cat == ds_name_config + + for ds_name_cat, ds_name_config in zip( + data_catalog.keys(), correct_config["catalog"] + ): + assert ds_name_cat == ds_name_config + + for ds in data_catalog.values(): + assert isinstance(ds, CSVDataset) + + for ds_name, ds in data_catalog.items(): + assert isinstance(ds, CSVDataset) + assert ds_name in correct_config["catalog"] + + def test_getitem_setitem(self, correct_config): + """Test get and set item.""" + data_catalog = KedroDataCatalog.from_config(**correct_config) + data_catalog["test"] = 123 + assert 
isinstance(data_catalog["test"], MemoryDataset) + + def test_ipython_key_completions(self, correct_config): + data_catalog = KedroDataCatalog.from_config(**correct_config) + assert data_catalog._ipython_key_completions_() == list( + correct_config["catalog"].keys() + ) + class TestDataCatalogVersioned: def test_from_correct_config_versioned(self, correct_config, dummy_dataframe): """Test load and save of versioned datasets from config"""