From d027ac590853f7213ce384683f8cef028521eb70 Mon Sep 17 00:00:00 2001 From: Ivan Danov Date: Thu, 16 May 2019 16:54:11 +0100 Subject: [PATCH 01/44] Merge pull request #19 from quantumblacklabs/release/0.14.0 Release 0.14.0 --- kedro/io/__init__.py | 55 ++++++ kedro/io/core.py | 450 +++++++++++++++++++++++++++++++++++++++++++ kedro/utils.py | 59 ++++++ 3 files changed, 564 insertions(+) create mode 100644 kedro/io/__init__.py create mode 100644 kedro/io/core.py create mode 100644 kedro/utils.py diff --git a/kedro/io/__init__.py b/kedro/io/__init__.py new file mode 100644 index 000000000..a426a6374 --- /dev/null +++ b/kedro/io/__init__.py @@ -0,0 +1,55 @@ +# Copyright 2018-2019 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited (“QuantumBlack”) name and logo +# (either separately or in combination, “QuantumBlack Trademarks”) are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. 
You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. + +"""``kedro.io`` provides functionality to read and write to a +number of data sets. At core of the library is ``AbstractDataSet`` +which allows implementation of various ``AbstractDataSet``s. +""" + +from .core import AbstractDataSet # NOQA +from .core import DataSetAlreadyExistsError # NOQA +from .core import DataSetError # NOQA +from .core import DataSetNotFoundError # NOQA +from .core import ExistsMixin # NOQA +from .core import FilepathVersionMixIn # NOQA +from .core import S3PathVersionMixIn # NOQA +from .core import Version # NOQA +from .csv_local import CSVLocalDataSet # NOQA +from .csv_s3 import CSVS3DataSet # NOQA +from .data_catalog import DataCatalog # NOQA +from .excel_local import ExcelLocalDataSet # NOQA +from .hdf_local import HDFLocalDataSet # NOQA +from .json_local import JSONLocalDataSet # NOQA +from .lambda_data_set import LambdaDataSet # NOQA +from .memory_data_set import MemoryDataSet # NOQA +from .parquet_local import ParquetLocalDataSet # NOQA +from .pickle_local import PickleLocalDataSet # NOQA +from .pickle_s3 import PickleS3DataSet # NOQA +from .sql import SQLQueryDataSet # NOQA +from .sql import SQLTableDataSet # NOQA +from .text_local import TextLocalDataSet # NOQA diff --git a/kedro/io/core.py b/kedro/io/core.py new file mode 100644 index 000000000..59a355b5b --- /dev/null +++ b/kedro/io/core.py @@ -0,0 +1,450 @@ +# Copyright 2018-2019 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited (“QuantumBlack”) name and logo +# (either separately or in combination, “QuantumBlack Trademarks”) are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This module provides a set of classes which underpin the data loading and +saving functionality provided by ``kedro.io``. +""" + +import abc +import copy +import logging +from collections import namedtuple +from datetime import datetime, timezone +from glob import iglob +from pathlib import Path, PurePosixPath +from typing import Any, Dict, Type +from warnings import warn + +from kedro.utils import load_obj + +MAX_DESCRIPTION_LENGTH = 70 +VERSIONED_FLAG_KEY = "versioned" +VERSION_KEY = "version" + + +class DataSetError(Exception): + """``DataSetError`` raised by ``AbstractDataSet`` implementations + in case of failure of input/output methods. 
+ + ``AbstractDataSet`` implementations should provide instructive + information in case of failure. + """ + + pass + + +class DataSetNotFoundError(DataSetError): + """``DataSetNotFoundError`` raised by ``DataCatalog`` class in case of + trying to use a non-existing data set. + """ + + pass + + +class DataSetAlreadyExistsError(DataSetError): + """``DataSetAlreadyExistsError`` raised by ``DataCatalog`` class in case + of trying to add a data set which already exists in the ``DataCatalog``. + """ + + pass + + +class AbstractDataSet(abc.ABC): + """``AbstractDataSet`` is the base class for all data set implementations. + All data set implementations should extend this abstract class + and implement the methods marked as abstract. + + Example: + :: + + >>> from kedro.io import AbstractDataSet + >>> import pandas as pd + >>> + >>> class MyOwnDataSet(AbstractDataSet): + >>> def __init__(self, param1, param2): + >>> self._param1 = param1 + >>> self._param2 = param2 + >>> + >>> def _load(self) -> pd.DataFrame: + >>> print("Dummy load: {}".format(self._param1)) + >>> return pd.DataFrame() + >>> + >>> def _save(self, df: pd.DataFrame) -> None: + >>> print("Dummy save: {}".format(self._param2)) + >>> + >>> def _describe(self): + >>> return dict(param1=self._param1, param2=self._param2) + """ + + @classmethod + def from_config( + cls: Type, + name: str, + config: Dict[str, Any], + load_version: str = None, + save_version: str = None, + ) -> "AbstractDataSet": + """Create a data set instance using the configuration provided. + + Args: + name: Data set name. + config: Data set config dictionary. + load_version: Version string to be used for ``load`` operation if + the data set is versioned. Has no effect on the data set + if versioning was not enabled. + save_version: Version string to be used for ``save`` operation if + the data set is versioned. Has no effect on the data set + if versioning was not enabled. + + Returns: + An instance of an ``AbstractDataSet`` subclass. 
+ + Raises: + DataSetError: When the function fails to create the data set + from its config. + + """ + config = copy.deepcopy(config) + save_version = save_version or generate_current_version() + + if VERSION_KEY in config: + # remove "version" key so that it's not passed + # to the 'unversioned' data set constructor + message = ( + "`%s` attribute removed from `%s` data set " + "configuration since it is a reserved word and cannot " + "be directly specified", + VERSION_KEY, + name, + ) + logging.getLogger(__name__).warning(*message) + del config[VERSION_KEY] + if config.pop(VERSIONED_FLAG_KEY, False): # data set is versioned + config[VERSION_KEY] = Version(load_version, save_version) + + dataset_class_path = config.pop("type") + try: + class_obj = load_obj(dataset_class_path, "kedro.io") + except ImportError: + raise DataSetError( + "Cannot import module when trying to load type " + "`{}` for DataSet `{}`.".format(dataset_class_path, name) + ) + except AttributeError: + raise DataSetError( + "Class `{}` for DataSet `{}` not found.".format( + dataset_class_path, name + ) + ) + + if not issubclass(class_obj, AbstractDataSet): + raise DataSetError( + "DataSet '{}' type `{}.{}` is invalid: " + "all data set types must extend " + "`AbstractDataSet`.".format( + name, class_obj.__module__, class_obj.__qualname__ + ) + ) + try: + data_set = class_obj(**config) + except TypeError as err: + raise DataSetError( + "\n{}.\nDataSet '{}' must only contain " + "arguments valid for the constructor " + "of `{}.{}`.".format( + str(err), name, class_obj.__module__, class_obj.__qualname__ + ) + ) + except Exception as err: + raise DataSetError( + "\n{}.\nFailed to instantiate DataSet " + "'{}' of type `{}.{}`.".format( + str(err), name, class_obj.__module__, class_obj.__qualname__ + ) + ) + return data_set + + def load(self) -> Any: + """Loads data by delegation to the provided load method. + + Returns: + Data returned by the provided load method. 
+ + Raises: + DataSetError: When underlying load method raises error. + + """ + + try: + logging.getLogger(__name__).debug("Loading %s", str(self)) + return self._load() + except DataSetError: + raise + except Exception as exc: + # This exception handling is by design as the composed data sets + # can throw any type of exception. + message = "Failed while loading data from data set {}.\n{}".format( + str(self), str(exc) + ) + raise DataSetError(message) from exc + + def save(self, data: Any) -> None: + """Saves data by delegation to the provided save method. + + Args: + data: the value to be saved by provided save method. + + Raises: + DataSetError: when underlying save method raises error. + + """ + + if data is None: + raise DataSetError("Saving `None` to a `DataSet` is not allowed") + + try: + logging.getLogger(__name__).debug("Saving %s", str(self)) + self._save(data) + except DataSetError: + raise + except Exception as exc: + message = "Failed while saving data to data set {}.\n{}".format( + str(self), str(exc) + ) + raise DataSetError(message) from exc + + def __str__(self): + def _to_str(obj, is_root=False): + """Returns a string representation where + 1. The root level (i.e. the DataSet.__init__ arguments) are + formatted like DataSet(key=value). + 2. Dictionaries have the keys alphabetically sorted recursively. + 3. Empty dictionaries and None values are not shown. + 4. String representations of dictionary values are + capped to MAX_DESCRIPTION_LENGTH. + """ + + fmt = "{}={}" if is_root else "'{}': {}" # 1 + + if isinstance(obj, dict): + sorted_dict = sorted(obj.items(), key=lambda pair: str(pair[0])) # 2 + + text = ", ".join( + fmt.format(key, _to_str(value)) # 2 + for key, value in sorted_dict + if value or isinstance(value, bool) + ) # 3 + + return text if is_root else "{" + text + "}" # 1 + + # not a dictionary + value = str(obj) + suffix = "" if len(value) <= MAX_DESCRIPTION_LENGTH else "..." 
+ return value[:MAX_DESCRIPTION_LENGTH] + suffix # 4 + + return "{}({})".format(type(self).__name__, _to_str(self._describe(), True)) + + @abc.abstractmethod + def _load(self) -> Any: + raise NotImplementedError( + "`{}` is a subclass of AbstractDataSet and" + "it must implement the `_load` method".format(self.__class__.__name__) + ) + + @abc.abstractmethod + def _save(self, data: Any) -> None: + raise NotImplementedError( + "`{}` is a subclass of AbstractDataSet and" + "it must implement the `_save` method".format(self.__class__.__name__) + ) + + @abc.abstractmethod + def _describe(self) -> Dict[str, Any]: + raise NotImplementedError( + "`{}` is a subclass of AbstractDataSet and" + "it must implement the `_describe` method".format(self.__class__.__name__) + ) + + +class ExistsMixin(abc.ABC): + """Mixin class which provides an exists() method.""" + + def exists(self) -> bool: + """Checks whether a data set's output already exists by calling + the provided _exists() method. + + Returns: + Flag indicating whether the output already exists. + + Raises: + DataSetError: when underlying exists method raises error. + + """ + try: + logging.getLogger(__name__).debug( + "Checking whether target of %s exists", str(self) + ) + return self._exists() + except Exception as exc: + message = "Failed during exists check for data set {}.\n{}".format( + str(self), str(exc) + ) + raise DataSetError(message) from exc + + @abc.abstractmethod + def _exists(self) -> bool: + raise NotImplementedError( + "`{}` inherits from ExistsMixin and " + "it must implement the `_exists` method".format(self.__class__.__name__) + ) + + +def generate_current_version() -> str: + """Generate the current version to be used by versioned data sets. + + Returns: + String representation of the current version. 
+ + """ + current_ts = datetime.now(tz=timezone.utc) + fmt = ( + "{d.year:04d}-{d.month:02d}-{d.day:02d}T{d.hour:02d}" + ".{d.minute:02d}.{d.second:02d}.{ms:03d}Z" + ) + return fmt.format(d=current_ts, ms=current_ts.microsecond // 1000) + + +class Version(namedtuple("Version", ["load", "save"])): + """This namedtuple is used to provide load and save versions for versioned + data sets. If ``Version.load`` is None, then the latest available version + is loaded. If ``Version.save`` is None, then save version is formatted as + YYYY-MM-DDThh.mm.ss.sssZ of the current timestamp. + """ + + __slots__ = () + + +_PATH_CONSISTENCY_WARNING = ( + "Save path `{}` did not match load path `{}` for {}. This is strongly " + "discouraged due to inconsistencies it may cause between `save` and " + "`load` operations. Please refrain from setting exact load version for " + "intermediate data sets where possible to avoid this warning." +) + + +# pylint: disable=too-few-public-methods +class FilepathVersionMixIn: + """Mixin class which helps to version filepath-like data sets.""" + + def _get_load_path(self, filepath: str, version: Version = None) -> str: + if not version: + return filepath + if version.load: + return self._get_versioned_path(filepath, version.load) + pattern = self._get_versioned_path(filepath, "*") + paths = [f for f in iglob(pattern) if Path(f).exists()] + if not paths: + message = "Did not find any versions for {}".format(str(self)) + raise DataSetError(message) + return sorted(paths, reverse=True)[0] + + def _get_save_path(self, filepath: str, version: Version = None) -> str: + if not version: + return filepath + save_version = version.save or generate_current_version() + versioned_path = self._get_versioned_path(filepath, save_version) + if Path(versioned_path).exists(): + message = ( + "Save path `{}` for {} must not exist if versioning " + "is enabled.".format(versioned_path, str(self)) + ) + raise DataSetError(message) + return versioned_path + + @staticmethod + 
def _get_versioned_path(filepath: str, version: str) -> str: + filepath = Path(filepath) + return str(filepath / version / filepath.name) + + def _check_paths_consistency(self, load_path: str, save_path: str): + if load_path != save_path: + warn(_PATH_CONSISTENCY_WARNING.format(save_path, load_path, str(self))) + + +# pylint: disable=too-few-public-methods +class S3PathVersionMixIn: + """Mixin class which helps to version S3 data sets.""" + + def _get_load_path( + self, client: Any, bucket: str, filepath: str, version: Version = None + ) -> str: + if not version: + return filepath + if version.load: + return self._get_versioned_path(filepath, version.load) + prefix = filepath if filepath.endswith("/") else filepath + "/" + keys = list(self._list_objects(client, bucket, prefix)) + if not keys: + message = "Did not find any versions for {}".format(str(self)) + raise DataSetError(message) + return sorted(keys, reverse=True)[0] + + def _get_save_path( + self, client: Any, bucket: str, filepath: str, version: Version = None + ) -> str: + if not version: + return filepath + save_version = version.save or generate_current_version() + versioned_path = self._get_versioned_path(filepath, save_version) + if versioned_path in self._list_objects(client, bucket, versioned_path): + message = ( + "Save path `{}` for {} must not exist if versioning " + "is enabled.".format(versioned_path, str(self)) + ) + raise DataSetError(message) + return versioned_path + + def _check_paths_consistency(self, load_path: str, save_path: str): + if load_path != save_path: + warn(_PATH_CONSISTENCY_WARNING.format(save_path, load_path, str(self))) + + @staticmethod + def _get_versioned_path(filepath: str, version: str) -> str: + filepath = PurePosixPath(filepath) + return str(filepath / version / filepath.name) + + @staticmethod + def _list_objects(client: Any, bucket: str, prefix: str): + paginator = client.get_paginator("list_objects_v2") + page_iterator = paginator.paginate(Bucket=bucket, 
Prefix=prefix) + for page in page_iterator: + yield from ( + obj["Key"] + for obj in page.get("Contents", []) + if not obj["Key"].endswith("/") + ) diff --git a/kedro/utils.py b/kedro/utils.py new file mode 100644 index 000000000..63a0b8362 --- /dev/null +++ b/kedro/utils.py @@ -0,0 +1,59 @@ +# Copyright 2018-2019 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited (“QuantumBlack”) name and logo +# (either separately or in combination, “QuantumBlack Trademarks”) are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This module provides a set of helper functions being used across different components +of kedro package. 
+""" + +import importlib +from typing import Any + + +def load_obj(obj_path: str, default_obj_path: str) -> Any: + """Extract an object from a given path. + + Args: + obj_path: Path to an object to be extracted, including the object name. + default_obj_path: Default object path. + + Returns: + Extracted object. + + Raises: + AttributeError: When the object does not have the given named attribute. + + """ + obj_path_list = obj_path.rsplit(".", 1) + obj_path = obj_path_list.pop(0) if len(obj_path_list) > 1 else default_obj_path + obj_name = obj_path_list[0] + module_obj = importlib.import_module(obj_path) + if not hasattr(module_obj, obj_name): + raise AttributeError( + "Object `{}` cannot be loaded from `{}`.".format(obj_name, obj_path) + ) + return getattr(module_obj, obj_name) From c947de919383735b2a411fe8d52aee9ad31fe24c Mon Sep 17 00:00:00 2001 From: Nasef Khan Date: Fri, 31 May 2019 11:39:15 +0100 Subject: [PATCH 02/44] Merge pull request #51 from quantumblacklabs/release/0.14.1 Release 0.14.1 --- kedro/io/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/kedro/io/__init__.py b/kedro/io/__init__.py index a426a6374..119f08bcb 100644 --- a/kedro/io/__init__.py +++ b/kedro/io/__init__.py @@ -44,6 +44,7 @@ from .data_catalog import DataCatalog # NOQA from .excel_local import ExcelLocalDataSet # NOQA from .hdf_local import HDFLocalDataSet # NOQA +from .hdf_s3 import HDFS3DataSet # NOQA from .json_local import JSONLocalDataSet # NOQA from .lambda_data_set import LambdaDataSet # NOQA from .memory_data_set import MemoryDataSet # NOQA From 8b218eceadd95c98858ace5beff5fda28a696bb7 Mon Sep 17 00:00:00 2001 From: Nasef Khan Date: Tue, 11 Jun 2019 15:14:58 +0100 Subject: [PATCH 03/44] Merge pull request #81 from quantumblacklabs/release/0.14.2 Release 0.14.2 --- kedro/io/__init__.py | 2 +- kedro/io/core.py | 12 ++++-------- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/kedro/io/__init__.py b/kedro/io/__init__.py index 119f08bcb..2c9436ab2 100644 
--- a/kedro/io/__init__.py +++ b/kedro/io/__init__.py @@ -35,7 +35,6 @@ from .core import DataSetAlreadyExistsError # NOQA from .core import DataSetError # NOQA from .core import DataSetNotFoundError # NOQA -from .core import ExistsMixin # NOQA from .core import FilepathVersionMixIn # NOQA from .core import S3PathVersionMixIn # NOQA from .core import Version # NOQA @@ -54,3 +53,4 @@ from .sql import SQLQueryDataSet # NOQA from .sql import SQLTableDataSet # NOQA from .text_local import TextLocalDataSet # NOQA +from .transformers import AbstractTransformer # NOQA diff --git a/kedro/io/core.py b/kedro/io/core.py index 59a355b5b..45ec91597 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -290,10 +290,6 @@ def _describe(self) -> Dict[str, Any]: "it must implement the `_describe` method".format(self.__class__.__name__) ) - -class ExistsMixin(abc.ABC): - """Mixin class which provides an exists() method.""" - def exists(self) -> bool: """Checks whether a data set's output already exists by calling the provided _exists() method. @@ -316,12 +312,12 @@ def exists(self) -> bool: ) raise DataSetError(message) from exc - @abc.abstractmethod def _exists(self) -> bool: - raise NotImplementedError( - "`{}` inherits from ExistsMixin and " - "it must implement the `_exists` method".format(self.__class__.__name__) + logging.getLogger(__name__).warning( + "`exists()` not implemented for `%s`. 
" "Assuming output does not exist.", + self.__class__.__name__, ) + return False def generate_current_version() -> str: From c9ca3693cc029fcb9ae30f51da41469a2a8d35bb Mon Sep 17 00:00:00 2001 From: Nasef Khan Date: Wed, 26 Jun 2019 11:19:37 +0100 Subject: [PATCH 04/44] Merge pull request #105 from quantumblacklabs/release/0.14.3 Release 0.14.3 --- kedro/io/__init__.py | 4 ++-- kedro/io/core.py | 23 +++++++++++++++++++++-- kedro/utils.py | 4 ++-- 3 files changed, 25 insertions(+), 6 deletions(-) diff --git a/kedro/io/__init__.py b/kedro/io/__init__.py index 2c9436ab2..0707691f9 100644 --- a/kedro/io/__init__.py +++ b/kedro/io/__init__.py @@ -14,8 +14,8 @@ # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. # -# The QuantumBlack Visual Analytics Limited (“QuantumBlack”) name and logo -# (either separately or in combination, “QuantumBlack Trademarks”) are +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are # trademarks of QuantumBlack. The License does not grant you any right or # license to the QuantumBlack Trademarks. You may not use the QuantumBlack # Trademarks or any confusingly similar mark as a trademark for your product, diff --git a/kedro/io/core.py b/kedro/io/core.py index 45ec91597..b0a347b77 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -14,8 +14,8 @@ # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. # -# The QuantumBlack Visual Analytics Limited (“QuantumBlack”) name and logo -# (either separately or in combination, “QuantumBlack Trademarks”) are +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are # trademarks of QuantumBlack. 
The License does not grant you any right or # license to the QuantumBlack Trademarks. You may not use the QuantumBlack # Trademarks or any confusingly similar mark as a trademark for your product, @@ -319,6 +319,25 @@ def _exists(self) -> bool: ) return False + def release(self) -> bool: + """Release any cached data. + + Raises: + DataSetError: when underlying exists method raises error. + + """ + try: + logging.getLogger(__name__).debug("Releasing %s", str(self)) + self._release() + except Exception as exc: + message = "Failed during release for data set {}.\n{}".format( + str(self), str(exc) + ) + raise DataSetError(message) from exc + + def _release(self) -> None: + pass + def generate_current_version() -> str: """Generate the current version to be used by versioned data sets. diff --git a/kedro/utils.py b/kedro/utils.py index 63a0b8362..c126cb916 100644 --- a/kedro/utils.py +++ b/kedro/utils.py @@ -14,8 +14,8 @@ # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. # -# The QuantumBlack Visual Analytics Limited (“QuantumBlack”) name and logo -# (either separately or in combination, “QuantumBlack Trademarks”) are +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are # trademarks of QuantumBlack. The License does not grant you any right or # license to the QuantumBlack Trademarks. 
You may not use the QuantumBlack # Trademarks or any confusingly similar mark as a trademark for your product, From f06a603bde910b6ada105c5ba80216989bd3c863 Mon Sep 17 00:00:00 2001 From: "Kiyohito Kunii (Kiyo)" <8097799+921kiyo@users.noreply.github.com> Date: Tue, 13 Aug 2019 15:00:59 +0100 Subject: [PATCH 05/44] Merge pull request #184 from quantumblacklabs/release/0.15.0 Release 0.15.0 --- kedro/io/__init__.py | 4 +- kedro/io/core.py | 184 ++++++++++++++++++++++++------------------- kedro/utils.py | 2 +- 3 files changed, 105 insertions(+), 85 deletions(-) diff --git a/kedro/io/__init__.py b/kedro/io/__init__.py index 0707691f9..ba16c0651 100644 --- a/kedro/io/__init__.py +++ b/kedro/io/__init__.py @@ -32,12 +32,12 @@ """ from .core import AbstractDataSet # NOQA +from .core import AbstractVersionedDataSet # NOQA from .core import DataSetAlreadyExistsError # NOQA from .core import DataSetError # NOQA from .core import DataSetNotFoundError # NOQA -from .core import FilepathVersionMixIn # NOQA -from .core import S3PathVersionMixIn # NOQA from .core import Version # NOQA +from .csv_http import CSVHTTPDataSet # NOQA from .csv_local import CSVLocalDataSet # NOQA from .csv_s3 import CSVS3DataSet # NOQA from .data_catalog import DataCatalog # NOQA diff --git a/kedro/io/core.py b/kedro/io/core.py index b0a347b77..029bb87da 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -33,11 +33,13 @@ import abc import copy import logging +import os from collections import namedtuple from datetime import datetime, timezone from glob import iglob -from pathlib import Path, PurePosixPath -from typing import Any, Dict, Type +from pathlib import Path, PurePath +from typing import Any, Callable, Dict, List, Optional, Type +from urllib.parse import urlparse from warnings import warn from kedro.utils import load_obj @@ -314,12 +316,12 @@ def exists(self) -> bool: def _exists(self) -> bool: logging.getLogger(__name__).warning( - "`exists()` not implemented for `%s`. 
" "Assuming output does not exist.", + "`exists()` not implemented for `%s`. Assuming output does not exist.", self.__class__.__name__, ) return False - def release(self) -> bool: + def release(self) -> None: """Release any cached data. Raises: @@ -372,94 +374,112 @@ class Version(namedtuple("Version", ["load", "save"])): ) -# pylint: disable=too-few-public-methods -class FilepathVersionMixIn: - """Mixin class which helps to version filepath-like data sets.""" +def _local_exists(filepath: str) -> bool: + return Path(filepath).exists() + + +def is_remote_path(filepath: str) -> bool: + """ + Check if the given path looks like a remote URL (has scheme). + """ + # Get rid of Windows-specific "C:\" start, + # which is treated as a URL scheme. + _, filepath = os.path.splitdrive(filepath) + return bool(urlparse(filepath).scheme) + + +class AbstractVersionedDataSet(AbstractDataSet): + """ + ``AbstractVersionedDataSet`` is the base class for all versioned data set implementations. + All data sets that implement versioning should extend this abstract class + and implement the methods marked as abstract. 
+ + Example: + :: + + >>> from kedro.io import AbstractVersionedDataSet + >>> import pandas as pd + >>> + >>> class MyOwnDataSet(AbstractVersionedDataSet): + >>> def __init__(self, param1, param2, filepath, version): + >>> super().__init__(filepath, version) + >>> self._param1 = param1 + >>> self._param2 = param2 + >>> + >>> def _load(self) -> pd.DataFrame: + >>> load_path = self._get_load_path() + >>> return pd.read_csv(load_path) + >>> + >>> def _save(self, df: pd.DataFrame) -> None: + >>> save_path = self._get_save_path() + >>> df.to_csv(save_path) + >>> + >>> def _describe(self): + >>> return dict(version=self._version, param1=self._param1, param2=self._param2) + """ + + # pylint: disable=abstract-method + + def __init__( + self, + filepath: PurePath, + version: Optional[Version], + exists_function: Callable[[str], bool] = None, + glob_function: Callable[[str], List[str]] = None, + ): + """Creates a new instance of ``AbstractVersionedDataSet``. + + Args: + filepath: Path to file. + version: If specified, should be an instance of + ``kedro.io.core.Version``. If its ``load`` attribute is + None, the latest version will be loaded. If its ``save`` + attribute is None, save version will be autogenerated. + exists_function: Function that is used for determining whether + a path exists in a filesystem. + glob_function: Function that is used for finding all paths + in a filesystem, which match a given pattern. 
+ """ + self._filepath = filepath + self._version = version + self._exists_function = exists_function or _local_exists + self._glob_function = glob_function or iglob + + def _get_load_path(self) -> PurePath: + if not self._version: + return self._filepath + if self._version.load: + return self._get_versioned_path(self._version.load) + + pattern = str(self._get_versioned_path("*")) + paths = [ + path for path in self._glob_function(pattern) if self._exists_function(path) + ] - def _get_load_path(self, filepath: str, version: Version = None) -> str: - if not version: - return filepath - if version.load: - return self._get_versioned_path(filepath, version.load) - pattern = self._get_versioned_path(filepath, "*") - paths = [f for f in iglob(pattern) if Path(f).exists()] if not paths: - message = "Did not find any versions for {}".format(str(self)) - raise DataSetError(message) - return sorted(paths, reverse=True)[0] - - def _get_save_path(self, filepath: str, version: Version = None) -> str: - if not version: - return filepath - save_version = version.save or generate_current_version() - versioned_path = self._get_versioned_path(filepath, save_version) - if Path(versioned_path).exists(): - message = ( - "Save path `{}` for {} must not exist if versioning " - "is enabled.".format(versioned_path, str(self)) - ) - raise DataSetError(message) - return versioned_path + raise DataSetError("Did not find any versions for {}".format(str(self))) - @staticmethod - def _get_versioned_path(filepath: str, version: str) -> str: - filepath = Path(filepath) - return str(filepath / version / filepath.name) + most_recent = sorted(paths, reverse=True)[0] + return PurePath(most_recent) - def _check_paths_consistency(self, load_path: str, save_path: str): - if load_path != save_path: - warn(_PATH_CONSISTENCY_WARNING.format(save_path, load_path, str(self))) + def _get_save_path(self) -> PurePath: + if not self._version: + return self._filepath + save_version = self._version.save or 
generate_current_version() + versioned_path = self._get_versioned_path(save_version) -# pylint: disable=too-few-public-methods -class S3PathVersionMixIn: - """Mixin class which helps to version S3 data sets.""" - - def _get_load_path( - self, client: Any, bucket: str, filepath: str, version: Version = None - ) -> str: - if not version: - return filepath - if version.load: - return self._get_versioned_path(filepath, version.load) - prefix = filepath if filepath.endswith("/") else filepath + "/" - keys = list(self._list_objects(client, bucket, prefix)) - if not keys: - message = "Did not find any versions for {}".format(str(self)) - raise DataSetError(message) - return sorted(keys, reverse=True)[0] - - def _get_save_path( - self, client: Any, bucket: str, filepath: str, version: Version = None - ) -> str: - if not version: - return filepath - save_version = version.save or generate_current_version() - versioned_path = self._get_versioned_path(filepath, save_version) - if versioned_path in self._list_objects(client, bucket, versioned_path): - message = ( + if self._exists_function(str(versioned_path)): + raise DataSetError( "Save path `{}` for {} must not exist if versioning " "is enabled.".format(versioned_path, str(self)) ) - raise DataSetError(message) + return versioned_path - def _check_paths_consistency(self, load_path: str, save_path: str): + def _get_versioned_path(self, version: str) -> PurePath: + return self._filepath / version / self._filepath.name + + def _check_paths_consistency(self, load_path: PurePath, save_path: PurePath): if load_path != save_path: warn(_PATH_CONSISTENCY_WARNING.format(save_path, load_path, str(self))) - - @staticmethod - def _get_versioned_path(filepath: str, version: str) -> str: - filepath = PurePosixPath(filepath) - return str(filepath / version / filepath.name) - - @staticmethod - def _list_objects(client: Any, bucket: str, prefix: str): - paginator = client.get_paginator("list_objects_v2") - page_iterator = 
paginator.paginate(Bucket=bucket, Prefix=prefix) - for page in page_iterator: - yield from ( - obj["Key"] - for obj in page.get("Contents", []) - if not obj["Key"].endswith("/") - ) diff --git a/kedro/utils.py b/kedro/utils.py index c126cb916..9a9621f96 100644 --- a/kedro/utils.py +++ b/kedro/utils.py @@ -34,7 +34,7 @@ from typing import Any -def load_obj(obj_path: str, default_obj_path: str) -> Any: +def load_obj(obj_path: str, default_obj_path: str = "") -> Any: """Extract an object from a given path. Args: From f69bd54c8b9de8c18ac76ff78f4f3072b61b55ba Mon Sep 17 00:00:00 2001 From: Anton Kirilenko Date: Thu, 12 Sep 2019 16:41:03 +0100 Subject: [PATCH 06/44] Merge pull request #232 from quantumblacklabs/release/0.15.1 [KED-1003] Release 0.15.1 --- kedro/io/core.py | 48 ++++++++++++++++++++++++++++++++++++++++-------- kedro/utils.py | 1 - 2 files changed, 40 insertions(+), 9 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index 029bb87da..ba5f053c6 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -191,6 +191,12 @@ def from_config( ) return data_set + def get_last_load_version(self) -> Optional[str]: + """Versioned datasets should override this property to return last loaded + version""" + # pylint: disable=no-self-use + return None # pragma: no cover + def load(self) -> Any: """Loads data by delegation to the provided load method. @@ -215,6 +221,12 @@ def load(self) -> Any: ) raise DataSetError(message) from exc + def get_last_save_version(self) -> Optional[str]: + """Versioned datasets should override this property to return last saved + version.""" + # pylint: disable=no-self-use + return None # pragma: no cover + def save(self, data: Any) -> None: """Saves data by delegation to the provided save method. 
@@ -444,31 +456,51 @@ def __init__( self._version = version self._exists_function = exists_function or _local_exists self._glob_function = glob_function or iglob + self._last_load_version = None # type: Optional[str] + self._last_save_version = None # type: Optional[str] + + def get_last_load_version(self) -> Optional[str]: + return self._last_load_version def _get_load_path(self) -> PurePath: if not self._version: + # When versioning is disabled, load from provided filepath + self._last_load_version = None return self._filepath + if self._version.load: + # When load version is pinned, get versioned path + self._last_load_version = self._version.load return self._get_versioned_path(self._version.load) + # When load version is unpinned, fetch the most recent existing + # version from the given path pattern = str(self._get_versioned_path("*")) - paths = [ - path for path in self._glob_function(pattern) if self._exists_function(path) - ] + version_paths = sorted(self._glob_function(pattern), reverse=True) + most_recent = next( + (path for path in version_paths if self._exists_function(path)), None + ) - if not paths: + if not most_recent: raise DataSetError("Did not find any versions for {}".format(str(self))) - most_recent = sorted(paths, reverse=True)[0] - return PurePath(most_recent) + versioned_path = PurePath(most_recent) + self._last_load_version = versioned_path.parent.name + + return versioned_path + + def get_last_save_version(self) -> Optional[str]: + return self._last_save_version def _get_save_path(self) -> PurePath: if not self._version: + # When versioning is disabled, return given filepath + self._last_save_version = None return self._filepath - save_version = self._version.save or generate_current_version() - versioned_path = self._get_versioned_path(save_version) + self._last_save_version = self._version.save or generate_current_version() + versioned_path = self._get_versioned_path(self._last_save_version) if 
self._exists_function(str(versioned_path)): raise DataSetError( "Save path `{}` for {} must not exist if versioning " diff --git a/kedro/utils.py b/kedro/utils.py index 9a9621f96..a4e0d07a3 100644 --- a/kedro/utils.py +++ b/kedro/utils.py @@ -29,7 +29,6 @@ """This module provides a set of helper functions being used across different components of kedro package. """ - import importlib from typing import Any From f4e6ea88946f739cc9519b1adc834e4111bcc799 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lorena=20B=C4=83lan?= Date: Tue, 8 Oct 2019 17:01:46 +0100 Subject: [PATCH 07/44] Release 0.15.2 --- kedro/io/core.py | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index ba5f053c6..4885add2a 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -132,7 +132,7 @@ def from_config( """ config = copy.deepcopy(config) - save_version = save_version or generate_current_version() + save_version = save_version or generate_timestamp() if VERSION_KEY in config: # remove "version" key so that it's not passed @@ -191,6 +191,10 @@ def from_config( ) return data_set + @property + def _logger(self) -> logging.Logger: + return logging.getLogger(__name__) + def get_last_load_version(self) -> Optional[str]: """Versioned datasets should override this property to return last loaded version""" @@ -209,7 +213,7 @@ def load(self) -> Any: """ try: - logging.getLogger(__name__).debug("Loading %s", str(self)) + self._logger.debug("Loading %s", str(self)) return self._load() except DataSetError: raise @@ -242,7 +246,7 @@ def save(self, data: Any) -> None: raise DataSetError("Saving `None` to a `DataSet` is not allowed") try: - logging.getLogger(__name__).debug("Saving %s", str(self)) + self._logger.debug("Saving %s", str(self)) self._save(data) except DataSetError: raise @@ -316,9 +320,7 @@ def exists(self) -> bool: """ try: - logging.getLogger(__name__).debug( - "Checking whether target of %s exists", 
str(self) - ) + self._logger.debug("Checking whether target of %s exists", str(self)) return self._exists() except Exception as exc: message = "Failed during exists check for data set {}.\n{}".format( @@ -327,7 +329,7 @@ def exists(self) -> bool: raise DataSetError(message) from exc def _exists(self) -> bool: - logging.getLogger(__name__).warning( + self._logger.warning( "`exists()` not implemented for `%s`. Assuming output does not exist.", self.__class__.__name__, ) @@ -341,7 +343,7 @@ def release(self) -> None: """ try: - logging.getLogger(__name__).debug("Releasing %s", str(self)) + self._logger.debug("Releasing %s", str(self)) self._release() except Exception as exc: message = "Failed during release for data set {}.\n{}".format( @@ -353,11 +355,11 @@ def _release(self) -> None: pass -def generate_current_version() -> str: - """Generate the current version to be used by versioned data sets. +def generate_timestamp() -> str: + """Generate the timestamp to be used by versioning. Returns: - String representation of the current version. + String representation of the current timestamp. """ current_ts = datetime.now(tz=timezone.utc) @@ -387,7 +389,8 @@ class Version(namedtuple("Version", ["load", "save"])): def _local_exists(filepath: str) -> bool: - return Path(filepath).exists() + filepath = Path(filepath) + return filepath.exists() or any(par.is_file() for par in filepath.parents) def is_remote_path(filepath: str) -> bool: @@ -400,11 +403,11 @@ def is_remote_path(filepath: str) -> bool: return bool(urlparse(filepath).scheme) -class AbstractVersionedDataSet(AbstractDataSet): +class AbstractVersionedDataSet(AbstractDataSet, abc.ABC): """ - ``AbstractVersionedDataSet`` is the base class for all versioned data set implementations. - All data sets that implement versioning should extend this abstract class - and implement the methods marked as abstract. + ``AbstractVersionedDataSet`` is the base class for all versioned data set + implementations. 
All data sets that implement versioning should extend this + abstract class and implement the methods marked as abstract. Example: :: @@ -498,7 +501,7 @@ def _get_save_path(self) -> PurePath: self._last_save_version = None return self._filepath - self._last_save_version = self._version.save or generate_current_version() + self._last_save_version = self._version.save or generate_timestamp() versioned_path = self._get_versioned_path(self._last_save_version) if self._exists_function(str(versioned_path)): From fc68c721b2a560a792a16d07a5087977f7ce66c9 Mon Sep 17 00:00:00 2001 From: Dmitrii Deriabin <44967953+DmitriiDeriabinQB@users.noreply.github.com> Date: Wed, 30 Oct 2019 16:50:41 +0000 Subject: [PATCH 08/44] Merge pull request #307 from quantumblacklabs/release/0.15.4 Release 0.15.4 --- kedro/io/core.py | 108 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 78 insertions(+), 30 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index 4885add2a..996945d0a 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -44,7 +44,6 @@ from kedro.utils import load_obj -MAX_DESCRIPTION_LENGTH = 70 VERSIONED_FLAG_KEY = "versioned" VERSION_KEY = "version" @@ -76,6 +75,14 @@ class DataSetAlreadyExistsError(DataSetError): pass +class VersionNotFoundError(DataSetError): + """``VersionNotFoundError`` raised by ``AbstractVersionedDataSet`` implementations + in case of no load versions available for the data set. + """ + + pass + + class AbstractDataSet(abc.ABC): """``AbstractDataSet`` is the base class for all data set implementations. All data set implementations should extend this abstract class @@ -212,8 +219,9 @@ def load(self) -> Any: """ + self._logger.debug("Loading %s", str(self)) + try: - self._logger.debug("Loading %s", str(self)) return self._load() except DataSetError: raise @@ -263,8 +271,6 @@ def _to_str(obj, is_root=False): formatted like DataSet(key=value). 2. Dictionaries have the keys alphabetically sorted recursively. 3. 
Empty dictionaries and None values are not shown. - 4. String representations of dictionary values are - capped to MAX_DESCRIPTION_LENGTH. """ fmt = "{}={}" if is_root else "'{}': {}" # 1 @@ -281,9 +287,7 @@ def _to_str(obj, is_root=False): return text if is_root else "{" + text + "}" # 1 # not a dictionary - value = str(obj) - suffix = "" if len(value) <= MAX_DESCRIPTION_LENGTH else "..." - return value[:MAX_DESCRIPTION_LENGTH] + suffix # 4 + return str(obj) return "{}({})".format(type(self).__name__, _to_str(self._describe(), True)) @@ -380,8 +384,8 @@ class Version(namedtuple("Version", ["load", "save"])): __slots__ = () -_PATH_CONSISTENCY_WARNING = ( - "Save path `{}` did not match load path `{}` for {}. This is strongly " +CONSISTENCY_WARNING = ( + "Save version `{}` did not match load version `{}` for {}. This is strongly " "discouraged due to inconsistencies it may cause between `save` and " "`load` operations. Please refrain from setting exact load version for " "intermediate data sets where possible to avoid this warning." 
@@ -415,6 +419,7 @@ class AbstractVersionedDataSet(AbstractDataSet, abc.ABC): >>> from kedro.io import AbstractVersionedDataSet >>> import pandas as pd >>> + >>> >>> class MyOwnDataSet(AbstractVersionedDataSet): >>> def __init__(self, param1, param2, filepath, version): >>> super().__init__(filepath, version) @@ -427,7 +432,11 @@ class AbstractVersionedDataSet(AbstractDataSet, abc.ABC): >>> >>> def _save(self, df: pd.DataFrame) -> None: >>> save_path = self._get_save_path() - >>> df.to_csv(save_path) + >>> df.to_csv(str(save_path)) + >>> + >>> def _exists(self) -> bool: + >>> path = self._get_load_path() + >>> return path.is_file() >>> >>> def _describe(self): >>> return dict(version=self._version, param1=self._param1, param2=self._param2) @@ -465,16 +474,11 @@ def __init__( def get_last_load_version(self) -> Optional[str]: return self._last_load_version - def _get_load_path(self) -> PurePath: + def _lookup_load_version(self) -> Optional[str]: if not self._version: - # When versioning is disabled, load from provided filepath - self._last_load_version = None - return self._filepath - + return None if self._version.load: - # When load version is pinned, get versioned path - self._last_load_version = self._version.load - return self._get_versioned_path(self._version.load) + return self._version.load # When load version is unpinned, fetch the most recent existing # version from the given path @@ -485,25 +489,35 @@ def _get_load_path(self) -> PurePath: ) if not most_recent: - raise DataSetError("Did not find any versions for {}".format(str(self))) + raise VersionNotFoundError( + "Did not find any versions for {}".format(str(self)) + ) - versioned_path = PurePath(most_recent) - self._last_load_version = versioned_path.parent.name + return PurePath(most_recent).parent.name - return versioned_path + def _get_load_path(self) -> PurePath: + if not self._version: + # When versioning is disabled, load from original filepath + return self._filepath + + load_version = 
self._last_load_version or self._lookup_load_version() + return self._get_versioned_path(load_version) # type: ignore def get_last_save_version(self) -> Optional[str]: return self._last_save_version + def _lookup_save_version(self) -> Optional[str]: + if not self._version: + return None + return self._version.save or generate_timestamp() + def _get_save_path(self) -> PurePath: if not self._version: - # When versioning is disabled, return given filepath - self._last_save_version = None + # When versioning is disabled, return original filepath return self._filepath - self._last_save_version = self._version.save or generate_timestamp() - - versioned_path = self._get_versioned_path(self._last_save_version) + save_version = self._last_save_version or self._lookup_save_version() + versioned_path = self._get_versioned_path(save_version) # type: ignore if self._exists_function(str(versioned_path)): raise DataSetError( "Save path `{}` for {} must not exist if versioning " @@ -515,6 +529,40 @@ def _get_save_path(self) -> PurePath: def _get_versioned_path(self, version: str) -> PurePath: return self._filepath / version / self._filepath.name - def _check_paths_consistency(self, load_path: PurePath, save_path: PurePath): - if load_path != save_path: - warn(_PATH_CONSISTENCY_WARNING.format(save_path, load_path, str(self))) + def load(self) -> Any: + self._last_load_version = self._lookup_load_version() + return super().load() + + def save(self, data: Any) -> None: + self._last_save_version = self._lookup_save_version() + super().save(data) + + load_version = self._lookup_load_version() + if load_version != self._last_save_version: + warn( + CONSISTENCY_WARNING.format( + self._last_save_version, load_version, str(self) + ) + ) + + def exists(self) -> bool: + """Checks whether a data set's output already exists by calling + the provided _exists() method. + + Returns: + Flag indicating whether the output already exists. 
+ + Raises: + DataSetError: when underlying exists method raises error. + + """ + self._logger.debug("Checking whether target of %s exists", str(self)) + try: + return self._exists() + except VersionNotFoundError: + return False + except Exception as exc: + message = "Failed during exists check for data set {}.\n{}".format( + str(self), str(exc) + ) + raise DataSetError(message) from exc From e3cf344308653d4c9316b6bd4d7bb85ddae7c859 Mon Sep 17 00:00:00 2001 From: andrii-ivaniuk Date: Thu, 12 Dec 2019 15:16:26 +0200 Subject: [PATCH 09/44] Merge pull request #352 from quantumblacklabs/release/0.15.5 Release 0.15.5 --- kedro/io/__init__.py | 2 + kedro/io/core.py | 108 +++++++++++++++++++++++++++---------------- 2 files changed, 69 insertions(+), 41 deletions(-) diff --git a/kedro/io/__init__.py b/kedro/io/__init__.py index ba16c0651..0a4a76b70 100644 --- a/kedro/io/__init__.py +++ b/kedro/io/__init__.py @@ -44,10 +44,12 @@ from .excel_local import ExcelLocalDataSet # NOQA from .hdf_local import HDFLocalDataSet # NOQA from .hdf_s3 import HDFS3DataSet # NOQA +from .json_dataset import JSONDataSet # NOQA from .json_local import JSONLocalDataSet # NOQA from .lambda_data_set import LambdaDataSet # NOQA from .memory_data_set import MemoryDataSet # NOQA from .parquet_local import ParquetLocalDataSet # NOQA +from .partitioned_data_set import PartitionedDataSet # NOQA from .pickle_local import PickleLocalDataSet # NOQA from .pickle_s3 import PickleS3DataSet # NOQA from .sql import SQLQueryDataSet # NOQA diff --git a/kedro/io/core.py b/kedro/io/core.py index 996945d0a..f71d65e94 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -38,7 +38,7 @@ from datetime import datetime, timezone from glob import iglob from pathlib import Path, PurePath -from typing import Any, Callable, Dict, List, Optional, Type +from typing import Any, Callable, Dict, List, Optional, Tuple, Type from urllib.parse import urlparse from warnings import warn @@ -138,49 +138,18 @@ def from_config( from 
its config. """ - config = copy.deepcopy(config) - save_version = save_version or generate_timestamp() - - if VERSION_KEY in config: - # remove "version" key so that it's not passed - # to the 'unversioned' data set constructor - message = ( - "`%s` attribute removed from `%s` data set " - "configuration since it is a reserved word and cannot " - "be directly specified", - VERSION_KEY, - name, - ) - logging.getLogger(__name__).warning(*message) - del config[VERSION_KEY] - if config.pop(VERSIONED_FLAG_KEY, False): # data set is versioned - config[VERSION_KEY] = Version(load_version, save_version) - - dataset_class_path = config.pop("type") try: - class_obj = load_obj(dataset_class_path, "kedro.io") - except ImportError: - raise DataSetError( - "Cannot import module when trying to load type " - "`{}` for DataSet `{}`.".format(dataset_class_path, name) + class_obj, config = parse_dataset_definition( + config, load_version, save_version ) - except AttributeError: + except Exception as ex: raise DataSetError( - "Class `{}` for DataSet `{}` not found.".format( - dataset_class_path, name - ) + "An exception occurred when parsing config " + "for DataSet `{}`:\n{}".format(name, str(ex)) ) - if not issubclass(class_obj, AbstractDataSet): - raise DataSetError( - "DataSet '{}' type `{}.{}` is invalid: " - "all data set types must extend " - "`AbstractDataSet`.".format( - name, class_obj.__module__, class_obj.__qualname__ - ) - ) try: - data_set = class_obj(**config) + data_set = class_obj(**config) # type: ignore except TypeError as err: raise DataSetError( "\n{}.\nDataSet '{}' must only contain " @@ -392,15 +361,72 @@ class Version(namedtuple("Version", ["load", "save"])): ) +def parse_dataset_definition( + config: Dict[str, Any], load_version: str = None, save_version: str = None +) -> Tuple[Type[AbstractDataSet], Dict]: + """Parse and instantiate a dataset class using the configuration provided. + + Args: + config: Data set config dictionary. 
It *must* contain the `type` key + with fully qualified class name. + load_version: Version string to be used for ``load`` operation if + the data set is versioned. Has no effect on the data set + if versioning was not enabled. + save_version: Version string to be used for ``save`` operation if + the data set is versioned. Has no effect on the data set + if versioning was not enabled. + + Raises: + DataSetError: If the function fails to parse the configuration provided. + + Returns: + 2-tuple: (Dataset class object, configuration dictionary) + """ + save_version = save_version or generate_timestamp() + config = copy.deepcopy(config) + + if "type" not in config: + raise DataSetError("`type` is missing from DataSet catalog configuration") + + class_obj = config.pop("type") + + if isinstance(class_obj, str): + try: + class_obj = load_obj(class_obj, "kedro.io") + except ImportError: + raise DataSetError( + "Cannot import module when trying to load type `{}`.".format(class_obj) + ) + except AttributeError: + raise DataSetError("Class `{}` not found.".format(class_obj)) + if not issubclass(class_obj, AbstractDataSet): + raise DataSetError( + "DataSet type `{}.{}` is invalid: all data set types must extend " + "`AbstractDataSet`.".format(class_obj.__module__, class_obj.__qualname__) + ) + + if VERSION_KEY in config: + # remove "version" key so that it's not passed + # to the "unversioned" data set constructor + message = ( + "`%s` attribute removed from data set configuration since it is a " + "reserved word and cannot be directly specified" + ) + logging.getLogger(__name__).warning(message, VERSION_KEY) + del config[VERSION_KEY] + if config.pop(VERSIONED_FLAG_KEY, False): # data set is versioned + config[VERSION_KEY] = Version(load_version, save_version) + + return class_obj, config + + def _local_exists(filepath: str) -> bool: filepath = Path(filepath) return filepath.exists() or any(par.is_file() for par in filepath.parents) def is_remote_path(filepath: str) -> bool: - 
""" - Check if the given path looks like a remote URL (has scheme). - """ + """Check if the given path looks like a remote URL (has scheme).""" # Get rid of Windows-specific "C:\" start, # which is treated as a URL scheme. _, filepath = os.path.splitdrive(filepath) From 5d2984083b331c94f5bad0b9c774a193afded763 Mon Sep 17 00:00:00 2001 From: Lim H Date: Wed, 26 Feb 2020 11:43:52 +0000 Subject: [PATCH 10/44] Merge pull request #455 from quantumblacklabs/release/0.15.6 Release/0.15.6 --- kedro/io/__init__.py | 5 +- kedro/io/core.py | 113 ++++++++++++++++++++++++++++++++++++++----- kedro/utils.py | 2 +- 3 files changed, 106 insertions(+), 14 deletions(-) diff --git a/kedro/io/__init__.py b/kedro/io/__init__.py index 0a4a76b70..9a3a61f90 100644 --- a/kedro/io/__init__.py +++ b/kedro/io/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2018-2019 QuantumBlack Visual Analytics Limited +# Copyright 2020 QuantumBlack Visual Analytics Limited # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -31,6 +31,7 @@ which allows implementation of various ``AbstractDataSet``s. 
""" +from .cached_dataset import CachedDataSet # NOQA from .core import AbstractDataSet # NOQA from .core import AbstractVersionedDataSet # NOQA from .core import DataSetAlreadyExistsError # NOQA @@ -41,6 +42,7 @@ from .csv_local import CSVLocalDataSet # NOQA from .csv_s3 import CSVS3DataSet # NOQA from .data_catalog import DataCatalog # NOQA +from .data_catalog_with_default import DataCatalogWithDefault # NOQA from .excel_local import ExcelLocalDataSet # NOQA from .hdf_local import HDFLocalDataSet # NOQA from .hdf_s3 import HDFS3DataSet # NOQA @@ -49,6 +51,7 @@ from .lambda_data_set import LambdaDataSet # NOQA from .memory_data_set import MemoryDataSet # NOQA from .parquet_local import ParquetLocalDataSet # NOQA +from .partitioned_data_set import IncrementalDataSet # NOQA from .partitioned_data_set import PartitionedDataSet # NOQA from .pickle_local import PickleLocalDataSet # NOQA from .pickle_s3 import PickleS3DataSet # NOQA diff --git a/kedro/io/core.py b/kedro/io/core.py index f71d65e94..5b9c0fe7b 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -1,4 +1,4 @@ -# Copyright 2018-2019 QuantumBlack Visual Analytics Limited +# Copyright 2020 QuantumBlack Visual Analytics Limited # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -34,18 +34,24 @@ import copy import logging import os +import warnings from collections import namedtuple from datetime import datetime, timezone from glob import iglob from pathlib import Path, PurePath from typing import Any, Callable, Dict, List, Optional, Tuple, Type from urllib.parse import urlparse -from warnings import warn + +from fsspec.utils import infer_storage_options from kedro.utils import load_obj +warnings.simplefilter("default", DeprecationWarning) + VERSIONED_FLAG_KEY = "versioned" VERSION_KEY = "version" +HTTP_PROTOCOLS = ("http", "https") +PROTOCOL_DELIMITER = "://" class DataSetError(Exception): @@ -327,6 +333,12 @@ def release(self) -> None: def _release(self) -> None: pass + def _copy(self, **overwrite_params) -> "AbstractDataSet": + dataset_copy = copy.deepcopy(self) + for name, value in overwrite_params.items(): + setattr(dataset_copy, name, value) + return dataset_copy + def generate_timestamp() -> str: """Generate the timestamp to be used by versioning. @@ -353,17 +365,19 @@ class Version(namedtuple("Version", ["load", "save"])): __slots__ = () -CONSISTENCY_WARNING = ( +_CONSISTENCY_WARNING = ( "Save version `{}` did not match load version `{}` for {}. This is strongly " "discouraged due to inconsistencies it may cause between `save` and " "`load` operations. Please refrain from setting exact load version for " "intermediate data sets where possible to avoid this warning." ) +_DEFAULT_PACKAGES = ["kedro.io.", "kedro.extras.datasets.", ""] + def parse_dataset_definition( config: Dict[str, Any], load_version: str = None, save_version: str = None -) -> Tuple[Type[AbstractDataSet], Dict]: +) -> Tuple[Type[AbstractDataSet], Dict[str, Any]]: """Parse and instantiate a dataset class using the configuration provided. 
Args: @@ -389,16 +403,20 @@ def parse_dataset_definition( raise DataSetError("`type` is missing from DataSet catalog configuration") class_obj = config.pop("type") - if isinstance(class_obj, str): - try: - class_obj = load_obj(class_obj, "kedro.io") - except ImportError: + if len(class_obj.strip(".")) != len(class_obj): raise DataSetError( - "Cannot import module when trying to load type `{}`.".format(class_obj) + "`type` class path does not support relative " + "paths or paths ending with a dot." ) - except AttributeError: + + class_paths = (prefix + class_obj for prefix in _DEFAULT_PACKAGES) + trials = (_load_obj(class_path) for class_path in class_paths) + try: + class_obj = next(obj for obj in trials if obj is not None) + except StopIteration: raise DataSetError("Class `{}` not found.".format(class_obj)) + if not issubclass(class_obj, AbstractDataSet): raise DataSetError( "DataSet type `{}.{}` is invalid: all data set types must extend " @@ -420,6 +438,14 @@ def parse_dataset_definition( return class_obj, config +def _load_obj(class_path: str) -> Optional[object]: + try: + class_obj = load_obj(class_path) + except (ImportError, AttributeError, ValueError): + return None + return class_obj + + def _local_exists(filepath: str) -> bool: filepath = Path(filepath) return filepath.exists() or any(par.is_file() for par in filepath.parents) @@ -565,8 +591,8 @@ def save(self, data: Any) -> None: load_version = self._lookup_load_version() if load_version != self._last_save_version: - warn( - CONSISTENCY_WARNING.format( + warnings.warn( + _CONSISTENCY_WARNING.format( self._last_save_version, load_version, str(self) ) ) @@ -592,3 +618,66 @@ def exists(self) -> bool: str(self), str(exc) ) raise DataSetError(message) from exc + + +def get_protocol_and_path(filepath: str, version: Version = None) -> Tuple[str, str]: + """Parses filepath on protocol and path. + + Args: + filepath: raw filepath e.g.: `gcs://bucket/test.json`. 
+ version: instance of ``kedro.io.core.Version`` or None. + + Returns: + Protocol and path. + + Raises: + DataSetError: when protocol is http(s) and version is not None. + Note: HTTP(s) dataset doesn't support versioning. + """ + options_dict = infer_storage_options(filepath) + path = options_dict["path"] + protocol = options_dict["protocol"] + + if protocol in HTTP_PROTOCOLS: + if version: + raise DataSetError( + "HTTP(s) DataSet doesn't support versioning. " + "Please remove version flag from the dataset configuration." + ) + path = path.split(PROTOCOL_DELIMITER, 1)[-1] + + return protocol, path + + +def get_filepath_str(path: PurePath, protocol: str) -> str: + """Returns filepath. Returns full filepath (with protocol) if protocol is HTTP(s). + + Args: + path: filepath without protocol. + protocol: protocol. + + Returns: + Filepath string. + """ + path = str(path) + if protocol in HTTP_PROTOCOLS: + path = "".join((protocol, PROTOCOL_DELIMITER, path)) + return path + + +def validate_on_forbidden_chars(**kwargs): + """Validate that string values do not include white-spaces or ;""" + for key, value in kwargs.items(): + if " " in value or ";" in value: + raise DataSetError( + "Neither white-space nor semicolon are allowed in `{}`.".format(key) + ) + + +def deprecation_warning(class_name): + """Log deprecation warning.""" + warnings.warn( + "{} will be deprecated in future releases. Please refer " + "to replacement datasets in kedro.extras.datasets.".format(class_name), + DeprecationWarning, + ) diff --git a/kedro/utils.py b/kedro/utils.py index a4e0d07a3..832818fba 100644 --- a/kedro/utils.py +++ b/kedro/utils.py @@ -1,4 +1,4 @@ -# Copyright 2018-2019 QuantumBlack Visual Analytics Limited +# Copyright 2020 QuantumBlack Visual Analytics Limited # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
From 9ff268e99d021427fdfed6fa8e428d6a4ab7fbf2 Mon Sep 17 00:00:00 2001 From: "Kiyohito Kunii (Kiyo)" <8097799+921kiyo@users.noreply.github.com> Date: Thu, 5 Mar 2020 09:57:04 +0000 Subject: [PATCH 11/44] Merge pull request #477 from quantumblacklabs/hotfix/0.15.8 Hotfix release 0.15.8 --- kedro/io/core.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index 5b9c0fe7b..9f38bab3d 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -411,6 +411,7 @@ def parse_dataset_definition( ) class_paths = (prefix + class_obj for prefix in _DEFAULT_PACKAGES) + trials = (_load_obj(class_path) for class_path in class_paths) try: class_obj = next(obj for obj in trials if obj is not None) @@ -441,8 +442,14 @@ def parse_dataset_definition( def _load_obj(class_path: str) -> Optional[object]: try: class_obj = load_obj(class_path) - except (ImportError, AttributeError, ValueError): + except ImportError as error: + if error.name in class_path: + return None + # class_obj was successfully loaded, but some dependencies are missing. + raise DataSetError("{} for {}".format(error, class_path)) + except (AttributeError, ValueError): return None + return class_obj From 0c9afa063841830bb44f5cc8a367714c8256e166 Mon Sep 17 00:00:00 2001 From: Lim H Date: Wed, 20 May 2020 11:50:48 +0100 Subject: [PATCH 12/44] Merge pull request #607 from quantumblacklabs/release/0.16.0 Bump version to 0.16.0 --- kedro/io/__init__.py | 18 +--- kedro/io/core.py | 245 +++++++++++++++++++++++++++---------------- kedro/utils.py | 4 +- 3 files changed, 156 insertions(+), 111 deletions(-) diff --git a/kedro/io/__init__.py b/kedro/io/__init__.py index 9a3a61f90..4219a158f 100644 --- a/kedro/io/__init__.py +++ b/kedro/io/__init__.py @@ -4,7 +4,7 @@ # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES @@ -19,7 +19,7 @@ # trademarks of QuantumBlack. The License does not grant you any right or # license to the QuantumBlack Trademarks. You may not use the QuantumBlack # Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause +# or use the QuantumBlack Trademarks in any other manner that might cause # confusion in the marketplace, including but not limited to in advertising, # on websites, or on software. # @@ -38,24 +38,10 @@ from .core import DataSetError # NOQA from .core import DataSetNotFoundError # NOQA from .core import Version # NOQA -from .csv_http import CSVHTTPDataSet # NOQA -from .csv_local import CSVLocalDataSet # NOQA -from .csv_s3 import CSVS3DataSet # NOQA from .data_catalog import DataCatalog # NOQA from .data_catalog_with_default import DataCatalogWithDefault # NOQA -from .excel_local import ExcelLocalDataSet # NOQA -from .hdf_local import HDFLocalDataSet # NOQA -from .hdf_s3 import HDFS3DataSet # NOQA -from .json_dataset import JSONDataSet # NOQA -from .json_local import JSONLocalDataSet # NOQA from .lambda_data_set import LambdaDataSet # NOQA from .memory_data_set import MemoryDataSet # NOQA -from .parquet_local import ParquetLocalDataSet # NOQA from .partitioned_data_set import IncrementalDataSet # NOQA from .partitioned_data_set import PartitionedDataSet # NOQA -from .pickle_local import PickleLocalDataSet # NOQA -from .pickle_s3 import PickleS3DataSet # NOQA -from .sql import SQLQueryDataSet # NOQA -from .sql import SQLTableDataSet # NOQA -from .text_local import TextLocalDataSet # NOQA from .transformers import AbstractTransformer # NOQA diff --git a/kedro/io/core.py 
b/kedro/io/core.py index 9f38bab3d..f03f70f7c 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -4,7 +4,7 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES @@ -19,7 +19,7 @@ # trademarks of QuantumBlack. The License does not grant you any right or # license to the QuantumBlack Trademarks. You may not use the QuantumBlack # Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause +# or use the QuantumBlack Trademarks in any other manner that might cause # confusion in the marketplace, including but not limited to in advertising, # on websites, or on software. # @@ -33,25 +33,30 @@ import abc import copy import logging -import os +import re import warnings from collections import namedtuple from datetime import datetime, timezone +from functools import partial from glob import iglob +from operator import attrgetter from pathlib import Path, PurePath from typing import Any, Callable, Dict, List, Optional, Tuple, Type -from urllib.parse import urlparse +from urllib.parse import urlsplit -from fsspec.utils import infer_storage_options +from cachetools import Cache, cachedmethod +from cachetools.keys import hashkey from kedro.utils import load_obj warnings.simplefilter("default", DeprecationWarning) +VERSION_FORMAT = "%Y-%m-%dT%H.%M.%S.%fZ" VERSIONED_FLAG_KEY = "versioned" VERSION_KEY = "version" HTTP_PROTOCOLS = ("http", "https") PROTOCOL_DELIMITER = "://" +CLOUD_PROTOCOLS = ("s3", "gcs", "gs", "adl", "abfs") class DataSetError(Exception): @@ -97,23 +102,37 @@ class AbstractDataSet(abc.ABC): Example: :: - >>> from kedro.io import AbstractDataSet + >>> from pathlib import Path, 
PurePosixPath >>> import pandas as pd + >>> from kedro.io import AbstractDataSet + >>> >>> >>> class MyOwnDataSet(AbstractDataSet): - >>> def __init__(self, param1, param2): + >>> def __init__(self, filepath, param1, param2=True): + >>> self._filepath = PurePosixPath(filepath) >>> self._param1 = param1 >>> self._param2 = param2 >>> >>> def _load(self) -> pd.DataFrame: - >>> print("Dummy load: {}".format(self._param1)) - >>> return pd.DataFrame() + >>> return pd.read_csv(self._filepath) >>> >>> def _save(self, df: pd.DataFrame) -> None: - >>> print("Dummy save: {}".format(self._param2)) + >>> df.to_csv(str(self._filepath)) + >>> + >>> def _exists(self) -> bool: + >>> return Path(self._filepath).exists() >>> >>> def _describe(self): >>> return dict(param1=self._param1, param2=self._param2) + + Example catalog.yml specification: + :: + + my_dataset: + type: .MyOwnDataSet + filepath: data/01_raw/my_data.csv + param1: # param1 is a required argument + # param2 will be True by default """ @classmethod @@ -177,12 +196,6 @@ def from_config( def _logger(self) -> logging.Logger: return logging.getLogger(__name__) - def get_last_load_version(self) -> Optional[str]: - """Versioned datasets should override this property to return last loaded - version""" - # pylint: disable=no-self-use - return None # pragma: no cover - def load(self) -> Any: """Loads data by delegation to the provided load method. @@ -208,12 +221,6 @@ def load(self) -> Any: ) raise DataSetError(message) from exc - def get_last_save_version(self) -> Optional[str]: - """Versioned datasets should override this property to return last saved - version.""" - # pylint: disable=no-self-use - return None # pragma: no cover - def save(self, data: Any) -> None: """Saves data by delegation to the provided save method. @@ -318,7 +325,7 @@ def release(self) -> None: """Release any cached data. Raises: - DataSetError: when underlying exists method raises error. + DataSetError: when underlying release method raises error. 
""" try: @@ -347,12 +354,8 @@ def generate_timestamp() -> str: String representation of the current timestamp. """ - current_ts = datetime.now(tz=timezone.utc) - fmt = ( - "{d.year:04d}-{d.month:02d}-{d.day:02d}T{d.hour:02d}" - ".{d.minute:02d}.{d.second:02d}.{ms:03d}Z" - ) - return fmt.format(d=current_ts, ms=current_ts.microsecond // 1000) + current_ts = datetime.now(tz=timezone.utc).strftime(VERSION_FORMAT) + return current_ts[:-4] + current_ts[-1:] # Don't keep microseconds class Version(namedtuple("Version", ["load", "save"])): @@ -440,32 +443,38 @@ def parse_dataset_definition( def _load_obj(class_path: str) -> Optional[object]: + mod_path, _, class_name = class_path.rpartition(".") + try: + available_classes = load_obj(f"{mod_path}.__all__") + # ModuleNotFoundError: When `load_obj` can't find `mod_path` (e.g `kedro.io.pandas`) + # this is because we try a combination of all prefixes. + # AttributeError: When `load_obj` manages to load `mod_path` but it doesn't have an + # `__all__` attribute -- either because it's a custom or a kedro.io dataset + except (ModuleNotFoundError, AttributeError, ValueError): + available_classes = None + try: class_obj = load_obj(class_path) - except ImportError as error: - if error.name in class_path: - return None - # class_obj was successfully loaded, but some dependencies are missing. 
- raise DataSetError("{} for {}".format(error, class_path)) - except (AttributeError, ValueError): + except (ModuleNotFoundError, ValueError): + return None + except AttributeError as error: + if available_classes and class_name in available_classes: + raise DataSetError( + f"{error} Please see the documentation on how to " + f"install relevant dependencies for {class_path}:\n" + f"https://kedro.readthedocs.io/en/stable/02_getting_started/" + f"02_install.html#optional-dependencies" + ) return None return class_obj -def _local_exists(filepath: str) -> bool: +def _local_exists(filepath: str) -> bool: # SKIP_IF_NO_SPARK filepath = Path(filepath) return filepath.exists() or any(par.is_file() for par in filepath.parents) -def is_remote_path(filepath: str) -> bool: - """Check if the given path looks like a remote URL (has scheme).""" - # Get rid of Windows-specific "C:\" start, - # which is treated as a URL scheme. - _, filepath = os.path.splitdrive(filepath) - return bool(urlparse(filepath).scheme) - - class AbstractVersionedDataSet(AbstractDataSet, abc.ABC): """ ``AbstractVersionedDataSet`` is the base class for all versioned data set @@ -475,13 +484,14 @@ class AbstractVersionedDataSet(AbstractDataSet, abc.ABC): Example: :: - >>> from kedro.io import AbstractVersionedDataSet + >>> from pathlib import Path, PurePosixPath >>> import pandas as pd + >>> from kedro.io import AbstractVersionedDataSet >>> >>> >>> class MyOwnDataSet(AbstractVersionedDataSet): - >>> def __init__(self, param1, param2, filepath, version): - >>> super().__init__(filepath, version) + >>> def __init__(self, filepath, version, param1, param2=True): + >>> super().__init__(PurePosixPath(filepath), version) >>> self._param1 = param1 >>> self._param2 = param2 >>> @@ -495,13 +505,21 @@ class AbstractVersionedDataSet(AbstractDataSet, abc.ABC): >>> >>> def _exists(self) -> bool: >>> path = self._get_load_path() - >>> return path.is_file() + >>> return Path(path).exists() >>> >>> def _describe(self): >>> 
return dict(version=self._version, param1=self._param1, param2=self._param2) - """ - # pylint: disable=abstract-method + Example catalog.yml specification: + :: + + my_dataset: + type: .MyOwnDataSet + filepath: data/01_raw/my_data.csv + versioned: true + param1: # param1 is a required argument + # param2 will be True by default + """ def __init__( self, @@ -527,20 +545,15 @@ def __init__( self._version = version self._exists_function = exists_function or _local_exists self._glob_function = glob_function or iglob - self._last_load_version = None # type: Optional[str] - self._last_save_version = None # type: Optional[str] - - def get_last_load_version(self) -> Optional[str]: - return self._last_load_version - - def _lookup_load_version(self) -> Optional[str]: - if not self._version: - return None - if self._version.load: - return self._version.load + # 1 entry for load version, 1 for save version + self._version_cache = Cache(maxsize=2) + # 'key' is set to prevent cache key overlapping for load and save: + # https://cachetools.readthedocs.io/en/stable/#cachetools.cachedmethod + @cachedmethod(cache=attrgetter("_version_cache"), key=partial(hashkey, "load")) + def _fetch_latest_load_version(self) -> str: # When load version is unpinned, fetch the most recent existing - # version from the given path + # version from the given path. 
pattern = str(self._get_versioned_path("*")) version_paths = sorted(self._glob_function(pattern), reverse=True) most_recent = next( @@ -548,35 +561,49 @@ def _lookup_load_version(self) -> Optional[str]: ) if not most_recent: - raise VersionNotFoundError( - "Did not find any versions for {}".format(str(self)) - ) + raise VersionNotFoundError(f"Did not find any versions for {self}") return PurePath(most_recent).parent.name + # 'key' is set to prevent cache key overlapping for load and save: + # https://cachetools.readthedocs.io/en/stable/#cachetools.cachedmethod + @cachedmethod(cache=attrgetter("_version_cache"), key=partial(hashkey, "save")) + def _fetch_latest_save_version(self) -> str: # pylint: disable=no-self-use + """Generate and cache the current save version""" + return generate_timestamp() + + def resolve_load_version(self) -> Optional[str]: + """Compute the version the dataset should be loaded with.""" + if not self._version: + return None + if self._version.load: + return self._version.load + return self._fetch_latest_load_version() + def _get_load_path(self) -> PurePath: if not self._version: # When versioning is disabled, load from original filepath return self._filepath - load_version = self._last_load_version or self._lookup_load_version() + load_version = self.resolve_load_version() return self._get_versioned_path(load_version) # type: ignore - def get_last_save_version(self) -> Optional[str]: - return self._last_save_version - - def _lookup_save_version(self) -> Optional[str]: + def resolve_save_version(self) -> Optional[str]: + """Compute the version the dataset should be saved with.""" if not self._version: return None - return self._version.save or generate_timestamp() + if self._version.save: + return self._version.save + return self._fetch_latest_save_version() def _get_save_path(self) -> PurePath: if not self._version: # When versioning is disabled, return original filepath return self._filepath - save_version = self._last_save_version or 
self._lookup_save_version() + save_version = self.resolve_save_version() versioned_path = self._get_versioned_path(save_version) # type: ignore + if self._exists_function(str(versioned_path)): raise DataSetError( "Save path `{}` for {} must not exist if versioning " @@ -589,19 +616,18 @@ def _get_versioned_path(self, version: str) -> PurePath: return self._filepath / version / self._filepath.name def load(self) -> Any: - self._last_load_version = self._lookup_load_version() + self.resolve_load_version() # Make sure last load version is set return super().load() def save(self, data: Any) -> None: - self._last_save_version = self._lookup_save_version() + self._version_cache.clear() + save_version = self.resolve_save_version() # Make sure last save version is set super().save(data) - load_version = self._lookup_load_version() - if load_version != self._last_save_version: + load_version = self.resolve_load_version() + if load_version != save_version: warnings.warn( - _CONSISTENCY_WARNING.format( - self._last_save_version, load_version, str(self) - ) + _CONSISTENCY_WARNING.format(save_version, load_version, str(self)) ) def exists(self) -> bool: @@ -620,12 +646,54 @@ def exists(self) -> bool: return self._exists() except VersionNotFoundError: return False - except Exception as exc: + except Exception as exc: # SKIP_IF_NO_SPARK message = "Failed during exists check for data set {}.\n{}".format( str(self), str(exc) ) raise DataSetError(message) from exc + def _release(self) -> None: + super()._release() + self._version_cache.clear() + + +def _parse_filepath(filepath: str) -> Dict[str, str]: + """Split filepath on protocol and path. Based on `fsspec.utils.infer_storage_options`. + + Args: + filepath: Either local absolute file path or URL (s3://bucket/file.csv) + + Returns: + Parsed filepath. 
+ """ + if ( + re.match(r"^[a-zA-Z]:[\\/]", filepath) + or re.match(r"^[a-zA-Z0-9]+://", filepath) is None + ): + return {"protocol": "file", "path": filepath} + + parsed_path = urlsplit(filepath) + protocol = parsed_path.scheme or "file" + + if protocol in HTTP_PROTOCOLS: + return {"protocol": protocol, "path": filepath} + + path = parsed_path.path + if protocol == "file": + windows_path = re.match(r"^/([a-zA-Z])[:|]([\\/].*)$", path) + if windows_path: + path = "{}:{}".format(*windows_path.groups()) + + options = {"protocol": protocol, "path": path} + + if parsed_path.netloc: + if protocol in CLOUD_PROTOCOLS: + host_with_port = parsed_path.netloc.rsplit("@", 1)[-1] + host = host_with_port.rsplit(":", 1)[0] + options["path"] = host + options["path"] + + return options + def get_protocol_and_path(filepath: str, version: Version = None) -> Tuple[str, str]: """Parses filepath on protocol and path. @@ -635,13 +703,13 @@ def get_protocol_and_path(filepath: str, version: Version = None) -> Tuple[str, version: instance of ``kedro.io.core.Version`` or None. Returns: - Protocol and path. + Protocol and path. Raises: - DataSetError: when protocol is http(s) and version is not None. - Note: HTTP(s) dataset doesn't support versioning. + DataSetError: when protocol is http(s) and version is not None. + Note: HTTP(s) dataset doesn't support versioning. """ - options_dict = infer_storage_options(filepath) + options_dict = _parse_filepath(filepath) path = options_dict["path"] protocol = options_dict["protocol"] @@ -679,12 +747,3 @@ def validate_on_forbidden_chars(**kwargs): raise DataSetError( "Neither white-space nor semicolon are allowed in `{}`.".format(key) ) - - -def deprecation_warning(class_name): - """Log deprecation warning.""" - warnings.warn( - "{} will be deprecated in future releases. 
Please refer " - "to replacement datasets in kedro.extras.datasets.".format(class_name), - DeprecationWarning, - ) diff --git a/kedro/utils.py b/kedro/utils.py index 832818fba..ed449ac25 100644 --- a/kedro/utils.py +++ b/kedro/utils.py @@ -4,7 +4,7 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES @@ -19,7 +19,7 @@ # trademarks of QuantumBlack. The License does not grant you any right or # license to the QuantumBlack Trademarks. You may not use the QuantumBlack # Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause +# or use the QuantumBlack Trademarks in any other manner that might cause # confusion in the marketplace, including but not limited to in advertising, # on websites, or on software. # From 7c9e434e116e83f23a1cb468c53d7ba08419bb6e Mon Sep 17 00:00:00 2001 From: Dmitrii Deriabin <44967953+DmitriiDeriabinQB@users.noreply.github.com> Date: Tue, 23 Jun 2020 19:19:10 +0100 Subject: [PATCH 13/44] Fix DataSet string representation for falsy values (#418) --- kedro/io/core.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index f03f70f7c..c5da014ca 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -252,7 +252,7 @@ def _to_str(obj, is_root=False): 1. The root level (i.e. the DataSet.__init__ arguments) are formatted like DataSet(key=value). 2. Dictionaries have the keys alphabetically sorted recursively. - 3. Empty dictionaries and None values are not shown. + 3. None values are not shown. 
""" fmt = "{}={}" if is_root else "'{}': {}" # 1 @@ -263,8 +263,8 @@ def _to_str(obj, is_root=False): text = ", ".join( fmt.format(key, _to_str(value)) # 2 for key, value in sorted_dict - if value or isinstance(value, bool) - ) # 3 + if value is not None # 3 + ) return text if is_root else "{" + text + "}" # 1 From 5eca5129e2be55a318d00f76edaf28efe56f61e8 Mon Sep 17 00:00:00 2001 From: Andrii Ivaniuk Date: Fri, 26 Jun 2020 15:10:29 +0300 Subject: [PATCH 14/44] Fixed versioning on Windows (#673) --- kedro/io/core.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index c5da014ca..6a9f3a7ff 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -40,7 +40,7 @@ from functools import partial from glob import iglob from operator import attrgetter -from pathlib import Path, PurePath +from pathlib import Path, PurePath, PurePosixPath from typing import Any, Callable, Dict, List, Optional, Tuple, Type from urllib.parse import urlsplit @@ -120,7 +120,7 @@ class AbstractDataSet(abc.ABC): >>> df.to_csv(str(self._filepath)) >>> >>> def _exists(self) -> bool: - >>> return Path(self._filepath).exists() + >>> return Path(self._filepath.as_posix()).exists() >>> >>> def _describe(self): >>> return dict(param1=self._param1, param2=self._param2) @@ -505,7 +505,7 @@ class AbstractVersionedDataSet(AbstractDataSet, abc.ABC): >>> >>> def _exists(self) -> bool: >>> path = self._get_load_path() - >>> return Path(path).exists() + >>> return Path(path.as_posix()).exists() >>> >>> def _describe(self): >>> return dict(version=self._version, param1=self._param1, param2=self._param2) @@ -523,7 +523,7 @@ class AbstractVersionedDataSet(AbstractDataSet, abc.ABC): def __init__( self, - filepath: PurePath, + filepath: PurePosixPath, version: Optional[Version], exists_function: Callable[[str], bool] = None, glob_function: Callable[[str], List[str]] = None, @@ -531,7 +531,7 @@ def __init__( """Creates a new instance of 
``AbstractVersionedDataSet``. Args: - filepath: Path to file. + filepath: Filepath in POSIX format to a file. version: If specified, should be an instance of ``kedro.io.core.Version``. If its ``load`` attribute is None, the latest version will be loaded. If its ``save`` @@ -580,7 +580,7 @@ def resolve_load_version(self) -> Optional[str]: return self._version.load return self._fetch_latest_load_version() - def _get_load_path(self) -> PurePath: + def _get_load_path(self) -> PurePosixPath: if not self._version: # When versioning is disabled, load from original filepath return self._filepath @@ -596,7 +596,7 @@ def resolve_save_version(self) -> Optional[str]: return self._version.save return self._fetch_latest_save_version() - def _get_save_path(self) -> PurePath: + def _get_save_path(self) -> PurePosixPath: if not self._version: # When versioning is disabled, return original filepath return self._filepath @@ -612,7 +612,7 @@ def _get_save_path(self) -> PurePath: return versioned_path - def _get_versioned_path(self, version: str) -> PurePath: + def _get_versioned_path(self, version: str) -> PurePosixPath: return self._filepath / version / self._filepath.name def load(self) -> Any: @@ -734,7 +734,7 @@ def get_filepath_str(path: PurePath, protocol: str) -> str: Returns: Filepath string. 
""" - path = str(path) + path = path.as_posix() if protocol in HTTP_PROTOCOLS: path = "".join((protocol, PROTOCOL_DELIMITER, path)) return path From ee95b11f55842b87ea8eacbfaef5d4665e2602c4 Mon Sep 17 00:00:00 2001 From: Jesaja Everling Date: Mon, 3 Aug 2020 10:35:39 +0200 Subject: [PATCH 15/44] Fix broken links to documentation (#473) --- kedro/io/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index 6a9f3a7ff..092e7828e 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -462,8 +462,8 @@ def _load_obj(class_path: str) -> Optional[object]: raise DataSetError( f"{error} Please see the documentation on how to " f"install relevant dependencies for {class_path}:\n" - f"https://kedro.readthedocs.io/en/stable/02_getting_started/" - f"02_install.html#optional-dependencies" + f"https://kedro.readthedocs.io/en/stable/" + f"04_kedro_project_setup/01_dependencies.html" ) return None From cd72df9ea510c805cde693405e31509f55e6c38a Mon Sep 17 00:00:00 2001 From: "Kiyohito Kunii (Kiyo)" <8097799+921kiyo@users.noreply.github.com> Date: Mon, 24 Aug 2020 16:06:25 +0100 Subject: [PATCH 16/44] Make Pylint 2.6.0 happy again (#765) --- kedro/io/core.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index 092e7828e..6ec331d81 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -167,11 +167,11 @@ def from_config( class_obj, config = parse_dataset_definition( config, load_version, save_version ) - except Exception as ex: + except Exception as exc: raise DataSetError( "An exception occurred when parsing config " - "for DataSet `{}`:\n{}".format(name, str(ex)) - ) + "for DataSet `{}`:\n{}".format(name, str(exc)) + ) from exc try: data_set = class_obj(**config) # type: ignore @@ -182,14 +182,14 @@ def from_config( "of `{}.{}`.".format( str(err), name, class_obj.__module__, class_obj.__qualname__ ) - ) + ) from err except Exception as err: 
raise DataSetError( "\n{}.\nFailed to instantiate DataSet " "'{}' of type `{}.{}`.".format( str(err), name, class_obj.__module__, class_obj.__qualname__ ) - ) + ) from err return data_set @property @@ -418,8 +418,8 @@ def parse_dataset_definition( trials = (_load_obj(class_path) for class_path in class_paths) try: class_obj = next(obj for obj in trials if obj is not None) - except StopIteration: - raise DataSetError("Class `{}` not found.".format(class_obj)) + except StopIteration as exc: + raise DataSetError("Class `{}` not found.".format(class_obj)) from exc if not issubclass(class_obj, AbstractDataSet): raise DataSetError( @@ -457,14 +457,14 @@ def _load_obj(class_path: str) -> Optional[object]: class_obj = load_obj(class_path) except (ModuleNotFoundError, ValueError): return None - except AttributeError as error: + except AttributeError as exc: if available_classes and class_name in available_classes: raise DataSetError( - f"{error} Please see the documentation on how to " + f"{exc} Please see the documentation on how to " f"install relevant dependencies for {class_path}:\n" f"https://kedro.readthedocs.io/en/stable/" f"04_kedro_project_setup/01_dependencies.html" - ) + ) from exc return None return class_obj From 573977b049c7e73a141004479a88f56283159173 Mon Sep 17 00:00:00 2001 From: Waylon Walker Date: Fri, 20 Nov 2020 08:29:45 -0600 Subject: [PATCH 17/44] Convert all format strings to f-strings (#574) --- kedro/io/core.py | 14 +++++--------- kedro/utils.py | 4 +--- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index 6ec331d81..72cb234dc 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -241,9 +241,7 @@ def save(self, data: Any) -> None: except DataSetError: raise except Exception as exc: - message = "Failed while saving data to data set {}.\n{}".format( - str(self), str(exc) - ) + message = f"Failed while saving data to data set {str(self)}.\n{str(exc)}" raise DataSetError(message) from exc def 
__str__(self): @@ -271,7 +269,7 @@ def _to_str(obj, is_root=False): # not a dictionary return str(obj) - return "{}({})".format(type(self).__name__, _to_str(self._describe(), True)) + return f"{type(self).__name__}({_to_str(self._describe(), True)})" @abc.abstractmethod def _load(self) -> Any: @@ -332,9 +330,7 @@ def release(self) -> None: self._logger.debug("Releasing %s", str(self)) self._release() except Exception as exc: - message = "Failed during release for data set {}.\n{}".format( - str(self), str(exc) - ) + message = f"Failed during release for data set {str(self)}.\n{str(exc)}" raise DataSetError(message) from exc def _release(self) -> None: @@ -419,7 +415,7 @@ def parse_dataset_definition( try: class_obj = next(obj for obj in trials if obj is not None) except StopIteration as exc: - raise DataSetError("Class `{}` not found.".format(class_obj)) from exc + raise DataSetError(f"Class `{class_obj}` not found.") from exc if not issubclass(class_obj, AbstractDataSet): raise DataSetError( @@ -745,5 +741,5 @@ def validate_on_forbidden_chars(**kwargs): for key, value in kwargs.items(): if " " in value or ";" in value: raise DataSetError( - "Neither white-space nor semicolon are allowed in `{}`.".format(key) + f"Neither white-space nor semicolon are allowed in `{key}`." 
) diff --git a/kedro/utils.py b/kedro/utils.py index ed449ac25..e1fe21065 100644 --- a/kedro/utils.py +++ b/kedro/utils.py @@ -52,7 +52,5 @@ def load_obj(obj_path: str, default_obj_path: str = "") -> Any: obj_name = obj_path_list[0] module_obj = importlib.import_module(obj_path) if not hasattr(module_obj, obj_name): - raise AttributeError( - "Object `{}` cannot be loaded from `{}`.".format(obj_name, obj_path) - ) + raise AttributeError(f"Object `{obj_name}` cannot be loaded from `{obj_path}`.") return getattr(module_obj, obj_name) From fb4a328d1cd2fee1ec07789c58eaf10dbf45ee69 Mon Sep 17 00:00:00 2001 From: Lim Hoang Date: Thu, 17 Dec 2020 12:33:19 +0000 Subject: [PATCH 18/44] Merge pull request #923 from quantumblacklabs/develop Merge develop into master --- kedro/io/core.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index 72cb234dc..e819c4b80 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -177,18 +177,13 @@ def from_config( data_set = class_obj(**config) # type: ignore except TypeError as err: raise DataSetError( - "\n{}.\nDataSet '{}' must only contain " - "arguments valid for the constructor " - "of `{}.{}`.".format( - str(err), name, class_obj.__module__, class_obj.__qualname__ - ) + f"\n{err}.\nDataSet '{name}' must only contain arguments valid for the " + f"constructor of `{class_obj.__module__}.{class_obj.__qualname__}`." ) from err except Exception as err: raise DataSetError( - "\n{}.\nFailed to instantiate DataSet " - "'{}' of type `{}.{}`.".format( - str(err), name, class_obj.__module__, class_obj.__qualname__ - ) + f"\n{err}.\nFailed to instantiate DataSet '{name}' " + f"of type `{class_obj.__module__}.{class_obj.__qualname__}`." 
) from err return data_set @@ -419,8 +414,8 @@ def parse_dataset_definition( if not issubclass(class_obj, AbstractDataSet): raise DataSetError( - "DataSet type `{}.{}` is invalid: all data set types must extend " - "`AbstractDataSet`.".format(class_obj.__module__, class_obj.__qualname__) + f"DataSet type `{class_obj.__module__}.{class_obj.__qualname__}` " + f"is invalid: all data set types must extend `AbstractDataSet`." ) if VERSION_KEY in config: From 255a65c716cdbce8ab87df40abe8beafb6ce05f1 Mon Sep 17 00:00:00 2001 From: Merel Theisen <49397448+MerelTheisenQB@users.noreply.github.com> Date: Wed, 13 Jan 2021 16:31:41 +0000 Subject: [PATCH 19/44] Update Copyright date to 2021 (#941) --- kedro/io/__init__.py | 2 +- kedro/io/core.py | 2 +- kedro/utils.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kedro/io/__init__.py b/kedro/io/__init__.py index 4219a158f..be3c51f38 100644 --- a/kedro/io/__init__.py +++ b/kedro/io/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2020 QuantumBlack Visual Analytics Limited +# Copyright 2021 QuantumBlack Visual Analytics Limited # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/kedro/io/core.py b/kedro/io/core.py index e819c4b80..ff1e5b616 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -1,4 +1,4 @@ -# Copyright 2020 QuantumBlack Visual Analytics Limited +# Copyright 2021 QuantumBlack Visual Analytics Limited # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/kedro/utils.py b/kedro/utils.py index e1fe21065..110afcc59 100644 --- a/kedro/utils.py +++ b/kedro/utils.py @@ -1,4 +1,4 @@ -# Copyright 2020 QuantumBlack Visual Analytics Limited +# Copyright 2021 QuantumBlack Visual Analytics Limited # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From e7367269d92d8d48cd5cef76ba86d0fd270dd8c9 Mon Sep 17 00:00:00 2001 From: Ignacio Paricio <54770971+ignacioparicio@users.noreply.github.com> Date: Mon, 7 Jun 2021 10:59:08 +0200 Subject: [PATCH 20/44] Upgrade project's `requirements.txt` #ked 2540 (#1126) --- kedro/utils.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kedro/utils.py b/kedro/utils.py index 110afcc59..2411de2c4 100644 --- a/kedro/utils.py +++ b/kedro/utils.py @@ -36,15 +36,15 @@ def load_obj(obj_path: str, default_obj_path: str = "") -> Any: """Extract an object from a given path. - Args: - obj_path: Path to an object to be extracted, including the object name. - default_obj_path: Default object path. + Args: + obj_path: Path to an object to be extracted, including the object name. + default_obj_path: Default object path. - Returns: - Extracted object. + Returns: + Extracted object. - Raises: - AttributeError: When the object does not have the given named attribute. + Raises: + AttributeError: When the object does not have the given named attribute. 
""" obj_path_list = obj_path.rsplit(".", 1) From e5051aab38901ef3f68c4bb726603669ce152e18 Mon Sep 17 00:00:00 2001 From: Ignacio Paricio <54770971+ignacioparicio@users.noreply.github.com> Date: Fri, 11 Jun 2021 18:02:47 +0200 Subject: [PATCH 21/44] Improve error msg when versioning an existing dataset (#1144) --- kedro/io/core.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index ff1e5b616..88f4306be 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -224,7 +224,8 @@ def save(self, data: Any) -> None: Raises: DataSetError: when underlying save method raises error. - + FileNotFoundError: when save method got file instead of dir, on Windows. + NotADirectoryError: when save method got file instead of dir, on Unix. """ if data is None: @@ -235,6 +236,8 @@ def save(self, data: Any) -> None: self._save(data) except DataSetError: raise + except (FileNotFoundError, NotADirectoryError): + raise except Exception as exc: message = f"Failed while saving data to data set {str(self)}.\n{str(exc)}" raise DataSetError(message) from exc @@ -613,7 +616,22 @@ def load(self) -> Any: def save(self, data: Any) -> None: self._version_cache.clear() save_version = self.resolve_save_version() # Make sure last save version is set - super().save(data) + try: + super().save(data) + except (FileNotFoundError, NotADirectoryError) as err: + # FileNotFoundError raised in Win, NotADirectoryError raised in Unix + _default_version = "YYYY-MM-DDThh.mm.ss.sssZ" + raise DataSetError( + f"Cannot save versioned dataset `{self._filepath.name}` to " + f"`{self._filepath.parent.as_posix()}` because a file with the same " + f"name already exists in the directory. This is likely because " + f"versioning was enabled on a dataset already saved previously. Either " + f"remove `{self._filepath.name}` from the directory or manually " + f"convert it into a versioned dataset by placing it in a versioned " + f"directory (e.g. 
with default versioning format " + f"`{self._filepath.as_posix()}/{_default_version}/{self._filepath.name}" + f"`)." + ) from err load_version = self.resolve_load_version() if load_version != save_version: From 4d75d89c48935bdc84a57c12b0da667e7998df5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lorena=20B=C4=83lan?= Date: Mon, 2 Aug 2021 11:53:43 +0100 Subject: [PATCH 22/44] DataSetError 'parsing config' thrown when ModuleNotFound (#1201) --- kedro/io/core.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index 88f4306be..e671dd73b 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -413,7 +413,10 @@ def parse_dataset_definition( try: class_obj = next(obj for obj in trials if obj is not None) except StopIteration as exc: - raise DataSetError(f"Class `{class_obj}` not found.") from exc + raise DataSetError( + f"Class `{class_obj}` not found or one of its dependencies" + f"has not been installed." + ) from exc if not issubclass(class_obj, AbstractDataSet): raise DataSetError( From 1fe598869c5ae587100fb25e6c0db83155b38811 Mon Sep 17 00:00:00 2001 From: Jiri Klein <44288863+jiriklein@users.noreply.github.com> Date: Tue, 3 Aug 2021 09:33:45 +0100 Subject: [PATCH 23/44] Update docstrings to mention the meaning of the _SINGLE_PROCESS flag in datasets (#1196) --- kedro/io/core.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index e671dd73b..e5fdcf1cc 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -98,7 +98,9 @@ class AbstractDataSet(abc.ABC): """``AbstractDataSet`` is the base class for all data set implementations. All data set implementations should extend this abstract class and implement the methods marked as abstract. - + If a specific dataset implementation cannot be used in conjunction with + the ``ParallelRunner``, such user-defined dataset should have the + attribute `_SINGLE_PROCESS = True`. 
Example: :: From 8b4f8b5217848a3f240c17d0930cba40ab20bab7 Mon Sep 17 00:00:00 2001 From: Waylon Walker Date: Thu, 9 Sep 2021 04:12:41 -0500 Subject: [PATCH 24/44] implement __all__ in __init__ modules (#874) --- kedro/io/__init__.py | 46 ++++++++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/kedro/io/__init__.py b/kedro/io/__init__.py index be3c51f38..dde54f72a 100644 --- a/kedro/io/__init__.py +++ b/kedro/io/__init__.py @@ -31,17 +31,35 @@ which allows implementation of various ``AbstractDataSet``s. """ -from .cached_dataset import CachedDataSet # NOQA -from .core import AbstractDataSet # NOQA -from .core import AbstractVersionedDataSet # NOQA -from .core import DataSetAlreadyExistsError # NOQA -from .core import DataSetError # NOQA -from .core import DataSetNotFoundError # NOQA -from .core import Version # NOQA -from .data_catalog import DataCatalog # NOQA -from .data_catalog_with_default import DataCatalogWithDefault # NOQA -from .lambda_data_set import LambdaDataSet # NOQA -from .memory_data_set import MemoryDataSet # NOQA -from .partitioned_data_set import IncrementalDataSet # NOQA -from .partitioned_data_set import PartitionedDataSet # NOQA -from .transformers import AbstractTransformer # NOQA +from .cached_dataset import CachedDataSet +from .core import ( + AbstractDataSet, + AbstractVersionedDataSet, + DataSetAlreadyExistsError, + DataSetError, + DataSetNotFoundError, + Version, +) +from .data_catalog import DataCatalog +from .data_catalog_with_default import DataCatalogWithDefault +from .lambda_data_set import LambdaDataSet +from .memory_data_set import MemoryDataSet +from .partitioned_data_set import IncrementalDataSet, PartitionedDataSet +from .transformers import AbstractTransformer + +__all__ = [ + "AbstractDataSet", + "AbstractTransformer", + "AbstractVersionedDataSet", + "CachedDataSet", + "DataCatalog", + "DataCatalogWithDefault", + "DataSetAlreadyExistsError", + "DataSetError", + 
"DataSetNotFoundError", + "IncrementalDataSet", + "LambdaDataSet", + "MemoryDataSet", + "PartitionedDataSet", + "Version", +] From 5de06ed3bf591af4f4acd132fff2afa7a5fedc00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lorena=20B=C4=83lan?= Date: Mon, 20 Sep 2021 15:03:18 +0100 Subject: [PATCH 25/44] Update release notes post release (#1240) --- kedro/io/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index e5fdcf1cc..a55e72731 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -171,8 +171,8 @@ def from_config( ) except Exception as exc: raise DataSetError( - "An exception occurred when parsing config " - "for DataSet `{}`:\n{}".format(name, str(exc)) + f"An exception occurred when parsing config " + f"for DataSet `{name}`:\n{str(exc)}" ) from exc try: From 468b97b37e3bd1fdc3b55db3e627dc16b779b7d3 Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Wed, 22 Sep 2021 11:17:53 -0400 Subject: [PATCH 26/44] Use `pyupgrade` to replace format strings and more (#1242) --- kedro/io/core.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index a55e72731..814712b53 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -213,8 +213,8 @@ def load(self) -> Any: except Exception as exc: # This exception handling is by design as the composed data sets # can throw any type of exception. 
- message = "Failed while loading data from data set {}.\n{}".format( - str(self), str(exc) + message = ( + f"Failed while loading data from data set {str(self)}.\n{str(exc)}" ) raise DataSetError(message) from exc @@ -274,22 +274,22 @@ def _to_str(obj, is_root=False): @abc.abstractmethod def _load(self) -> Any: raise NotImplementedError( - "`{}` is a subclass of AbstractDataSet and" - "it must implement the `_load` method".format(self.__class__.__name__) + f"`{self.__class__.__name__}` is a subclass of AbstractDataSet and " + f"it must implement the `_load` method" ) @abc.abstractmethod def _save(self, data: Any) -> None: raise NotImplementedError( - "`{}` is a subclass of AbstractDataSet and" - "it must implement the `_save` method".format(self.__class__.__name__) + f"`{self.__class__.__name__}` is a subclass of AbstractDataSet and " + f"it must implement the `_save` method" ) @abc.abstractmethod def _describe(self) -> Dict[str, Any]: raise NotImplementedError( - "`{}` is a subclass of AbstractDataSet and" - "it must implement the `_describe` method".format(self.__class__.__name__) + f"`{self.__class__.__name__}` is a subclass of AbstractDataSet and " + f"it must implement the `_describe` method" ) def exists(self) -> bool: @@ -307,8 +307,8 @@ def exists(self) -> bool: self._logger.debug("Checking whether target of %s exists", str(self)) return self._exists() except Exception as exc: - message = "Failed during exists check for data set {}.\n{}".format( - str(self), str(exc) + message = ( + f"Failed during exists check for data set {str(self)}.\n{str(exc)}" ) raise DataSetError(message) from exc @@ -605,8 +605,8 @@ def _get_save_path(self) -> PurePosixPath: if self._exists_function(str(versioned_path)): raise DataSetError( - "Save path `{}` for {} must not exist if versioning " - "is enabled.".format(versioned_path, str(self)) + f"Save path `{versioned_path}` for {str(self)} must not exist if " + f"versioning is enabled." 
) return versioned_path @@ -661,8 +661,8 @@ def exists(self) -> bool: except VersionNotFoundError: return False except Exception as exc: # SKIP_IF_NO_SPARK - message = "Failed during exists check for data set {}.\n{}".format( - str(self), str(exc) + message = ( + f"Failed during exists check for data set {str(self)}.\n{str(exc)}" ) raise DataSetError(message) from exc @@ -696,7 +696,7 @@ def _parse_filepath(filepath: str) -> Dict[str, str]: if protocol == "file": windows_path = re.match(r"^/([a-zA-Z])[:|]([\\/].*)$", path) if windows_path: - path = "{}:{}".format(*windows_path.groups()) + path = ":".join(windows_path.groups()) options = {"protocol": protocol, "path": path} From d681a8226ffb8bba8454e05ca41ff8f4cadc5210 Mon Sep 17 00:00:00 2001 From: Sajid Alam <90610031+SajidAlamQB@users.noreply.github.com> Date: Mon, 1 Nov 2021 15:06:30 +0000 Subject: [PATCH 27/44] Remove McK headers + Licence from codebase + docs (#1285) * Reverting license to standard Apache 2.0 * Removing make legal * Delete license_and_headers.py * Removed all occurrence of the copyright header from codebase * Removed copyrighted header from build-docs.sh * Removing copyright footer from all docs * Changes based on review * removed whitespace * cleanup --- kedro/io/__init__.py | 28 ---------------------------- kedro/io/core.py | 28 ---------------------------- kedro/utils.py | 28 ---------------------------- 3 files changed, 84 deletions(-) diff --git a/kedro/io/__init__.py b/kedro/io/__init__.py index dde54f72a..f172dff3a 100644 --- a/kedro/io/__init__.py +++ b/kedro/io/__init__.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """``kedro.io`` provides functionality to read and write to a number of data sets. At core of the library is ``AbstractDataSet`` which allows implementation of various ``AbstractDataSet``s. diff --git a/kedro/io/core.py b/kedro/io/core.py index 814712b53..7d4e0b73e 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """This module provides a set of classes which underpin the data loading and saving functionality provided by ``kedro.io``. """ diff --git a/kedro/utils.py b/kedro/utils.py index 2411de2c4..4c57b7911 100644 --- a/kedro/utils.py +++ b/kedro/utils.py @@ -1,31 +1,3 @@ -# Copyright 2021 QuantumBlack Visual Analytics Limited -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND -# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS -# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo -# (either separately or in combination, "QuantumBlack Trademarks") are -# trademarks of QuantumBlack. The License does not grant you any right or -# license to the QuantumBlack Trademarks. You may not use the QuantumBlack -# Trademarks or any confusingly similar mark as a trademark for your product, -# or use the QuantumBlack Trademarks in any other manner that might cause -# confusion in the marketplace, including but not limited to in advertising, -# on websites, or on software. -# -# See the License for the specific language governing permissions and -# limitations under the License. - """This module provides a set of helper functions being used across different components of kedro package. """ From 1ea8a92d420227b1acc5247a6650e4c411714bd9 Mon Sep 17 00:00:00 2001 From: Merel Theisen <49397448+MerelTheisenQB@users.noreply.github.com> Date: Mon, 8 Nov 2021 14:02:14 +0000 Subject: [PATCH 28/44] Fix versioning tracking datasets (#1016) --- kedro/io/core.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index 7d4e0b73e..f765f7484 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -380,7 +380,6 @@ def parse_dataset_definition( "`type` class path does not support relative " "paths or paths ending with a dot." 
) - class_paths = (prefix + class_obj for prefix in _DEFAULT_PACKAGES) trials = (_load_obj(class_path) for class_path in class_paths) @@ -407,7 +406,12 @@ def parse_dataset_definition( ) logging.getLogger(__name__).warning(message, VERSION_KEY) del config[VERSION_KEY] - if config.pop(VERSIONED_FLAG_KEY, False): # data set is versioned + + # dataset is either versioned explicitly by the user or versioned is set to true by default + # on the dataset + if config.pop(VERSIONED_FLAG_KEY, False) or getattr( + class_obj, VERSIONED_FLAG_KEY, False + ): config[VERSION_KEY] = Version(load_version, save_version) return class_obj, config From 5c4ff50d00b42dd12b5158488c4785efb7d83393 Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Wed, 8 Dec 2021 08:01:15 -0500 Subject: [PATCH 29/44] Ensure path is of type `str` if `overwrite` is set (#1094) --- kedro/io/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index f765f7484..075b8dacf 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -704,7 +704,7 @@ def get_protocol_and_path(filepath: str, version: Version = None) -> Tuple[str, protocol = options_dict["protocol"] if protocol in HTTP_PROTOCOLS: - if version: + if version is not None: raise DataSetError( "HTTP(s) DataSet doesn't support versioning. " "Please remove version flag from the dataset configuration." 
From 51e589d78af04076df481980e930c749afe4f67e Mon Sep 17 00:00:00 2001 From: Puneet Saini Date: Mon, 7 Feb 2022 21:39:29 +0530 Subject: [PATCH 30/44] Fix error message whitespace (#1206) Signed-off-by: Puneet Saini --- kedro/io/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index 075b8dacf..80743ddab 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -387,7 +387,7 @@ def parse_dataset_definition( class_obj = next(obj for obj in trials if obj is not None) except StopIteration as exc: raise DataSetError( - f"Class `{class_obj}` not found or one of its dependencies" + f"Class `{class_obj}` not found or one of its dependencies " f"has not been installed." ) from exc From af55fd0d5ab9f729fb9fed3986e90e8f054aaa10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lorena=20B=C4=83lan?= Date: Thu, 31 Mar 2022 14:18:17 +0100 Subject: [PATCH 31/44] Merge pull request #1397 from kedro-org/develop Merge develop into main in preparation for 0.18.0 release --- kedro/io/__init__.py | 10 +++------- kedro/io/core.py | 4 ++-- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/kedro/io/__init__.py b/kedro/io/__init__.py index f172dff3a..de6f6e49e 100644 --- a/kedro/io/__init__.py +++ b/kedro/io/__init__.py @@ -13,19 +13,15 @@ Version, ) from .data_catalog import DataCatalog -from .data_catalog_with_default import DataCatalogWithDefault -from .lambda_data_set import LambdaDataSet -from .memory_data_set import MemoryDataSet -from .partitioned_data_set import IncrementalDataSet, PartitionedDataSet -from .transformers import AbstractTransformer +from .lambda_dataset import LambdaDataSet +from .memory_dataset import MemoryDataSet +from .partitioned_dataset import IncrementalDataSet, PartitionedDataSet __all__ = [ "AbstractDataSet", - "AbstractTransformer", "AbstractVersionedDataSet", "CachedDataSet", "DataCatalog", - "DataCatalogWithDefault", "DataSetAlreadyExistsError", "DataSetError", "DataSetNotFoundError", diff --git 
a/kedro/io/core.py b/kedro/io/core.py index 80743ddab..6410d3018 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -438,7 +438,7 @@ def _load_obj(class_path: str) -> Optional[object]: f"{exc} Please see the documentation on how to " f"install relevant dependencies for {class_path}:\n" f"https://kedro.readthedocs.io/en/stable/" - f"04_kedro_project_setup/01_dependencies.html" + f"kedro_project_setup/dependencies.html" ) from exc return None @@ -521,7 +521,7 @@ def __init__( self._exists_function = exists_function or _local_exists self._glob_function = glob_function or iglob # 1 entry for load version, 1 for save version - self._version_cache = Cache(maxsize=2) + self._version_cache = Cache(maxsize=2) # type: Cache # 'key' is set to prevent cache key overlapping for load and save: # https://cachetools.readthedocs.io/en/stable/#cachetools.cachedmethod From 487eb3349d739711b7b83b20909f2e4cd3592d3f Mon Sep 17 00:00:00 2001 From: philomine Date: Tue, 24 May 2022 11:05:42 +0200 Subject: [PATCH 32/44] Add abfss to the list of cloud protocols (#1348) Signed-off-by: philomene.bobichon@konecranes.com --- kedro/io/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index 6410d3018..d8f676e8c 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -28,7 +28,7 @@ VERSION_KEY = "version" HTTP_PROTOCOLS = ("http", "https") PROTOCOL_DELIMITER = "://" -CLOUD_PROTOCOLS = ("s3", "gcs", "gs", "adl", "abfs") +CLOUD_PROTOCOLS = ("s3", "gcs", "gs", "adl", "abfs", "abfss") class DataSetError(Exception): From 35c1e4531602f2c2dbeb71f1f994e2eb76af9da5 Mon Sep 17 00:00:00 2001 From: Ahdra Merali <90615669+AhdraMeraliQB@users.noreply.github.com> Date: Wed, 8 Jun 2022 15:43:58 +0100 Subject: [PATCH 33/44] Replace backticks with single quotes (#1584) * Replace back-ticks with single quotes Signed-off-by: Ahdra Merali * Replace more backticks Signed-off-by: Ahdra Merali * Linting Signed-off-by: Ahdra Merali * Update 
kedro/extras/datasets/pandas/gbq_dataset.py Co-authored-by: Antony Milne <49395058+AntonyMilneQB@users.noreply.github.com> * Update kedro/framework/cli/project.py Co-authored-by: Antony Milne <49395058+AntonyMilneQB@users.noreply.github.com> * Fix tests pt 1 Signed-off-by: Ahdra Merali * Fix tests pt 2 Signed-off-by: Ahdra Merali * Fix tests pt 3 Signed-off-by: Ahdra Merali * Change quotes to follow convention Co-authored-by: Antony Milne <49395058+AntonyMilneQB@users.noreply.github.com> * Change quotes to follow convention pt 2 Co-authored-by: Antony Milne <49395058+AntonyMilneQB@users.noreply.github.com> * Fix tests Signed-off-by: Ahdra Merali * Fix e2e tests Signed-off-by: Ahdra Merali Co-authored-by: Antony Milne <49395058+AntonyMilneQB@users.noreply.github.com> --- kedro/io/core.py | 54 ++++++++++++++++++++++++------------------------ kedro/utils.py | 2 +- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index d8f676e8c..c0f13bf9a 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -144,7 +144,7 @@ def from_config( except Exception as exc: raise DataSetError( f"An exception occurred when parsing config " - f"for DataSet `{name}`:\n{str(exc)}" + f"for DataSet '{name}':\n{str(exc)}" ) from exc try: @@ -152,12 +152,12 @@ def from_config( except TypeError as err: raise DataSetError( f"\n{err}.\nDataSet '{name}' must only contain arguments valid for the " - f"constructor of `{class_obj.__module__}.{class_obj.__qualname__}`." + f"constructor of '{class_obj.__module__}.{class_obj.__qualname__}'." ) from err except Exception as err: raise DataSetError( f"\n{err}.\nFailed to instantiate DataSet '{name}' " - f"of type `{class_obj.__module__}.{class_obj.__qualname__}`." + f"of type '{class_obj.__module__}.{class_obj.__qualname__}'." 
) from err return data_set @@ -203,7 +203,7 @@ def save(self, data: Any) -> None: """ if data is None: - raise DataSetError("Saving `None` to a `DataSet` is not allowed") + raise DataSetError("Saving 'None' to a 'DataSet' is not allowed") try: self._logger.debug("Saving %s", str(self)) @@ -246,22 +246,22 @@ def _to_str(obj, is_root=False): @abc.abstractmethod def _load(self) -> Any: raise NotImplementedError( - f"`{self.__class__.__name__}` is a subclass of AbstractDataSet and " - f"it must implement the `_load` method" + f"'{self.__class__.__name__}' is a subclass of AbstractDataSet and " + f"it must implement the '_load' method" ) @abc.abstractmethod def _save(self, data: Any) -> None: raise NotImplementedError( - f"`{self.__class__.__name__}` is a subclass of AbstractDataSet and " - f"it must implement the `_save` method" + f"'{self.__class__.__name__}' is a subclass of AbstractDataSet and " + f"it must implement the '_save' method" ) @abc.abstractmethod def _describe(self) -> Dict[str, Any]: raise NotImplementedError( - f"`{self.__class__.__name__}` is a subclass of AbstractDataSet and " - f"it must implement the `_describe` method" + f"'{self.__class__.__name__}' is a subclass of AbstractDataSet and " + f"it must implement the '_describe' method" ) def exists(self) -> bool: @@ -286,7 +286,7 @@ def exists(self) -> bool: def _exists(self) -> bool: self._logger.warning( - "`exists()` not implemented for `%s`. Assuming output does not exist.", + "'exists()' not implemented for '%s'. Assuming output does not exist.", self.__class__.__name__, ) return False @@ -337,9 +337,9 @@ class Version(namedtuple("Version", ["load", "save"])): _CONSISTENCY_WARNING = ( - "Save version `{}` did not match load version `{}` for {}. This is strongly " - "discouraged due to inconsistencies it may cause between `save` and " - "`load` operations. Please refrain from setting exact load version for " + "Save version '{}' did not match load version '{}' for {}. 
This is strongly " + "discouraged due to inconsistencies it may cause between 'save' and " + "'load' operations. Please refrain from setting exact load version for " "intermediate data sets where possible to avoid this warning." ) @@ -371,13 +371,13 @@ def parse_dataset_definition( config = copy.deepcopy(config) if "type" not in config: - raise DataSetError("`type` is missing from DataSet catalog configuration") + raise DataSetError("'type' is missing from DataSet catalog configuration") class_obj = config.pop("type") if isinstance(class_obj, str): if len(class_obj.strip(".")) != len(class_obj): raise DataSetError( - "`type` class path does not support relative " + "'type' class path does not support relative " "paths or paths ending with a dot." ) class_paths = (prefix + class_obj for prefix in _DEFAULT_PACKAGES) @@ -387,21 +387,21 @@ def parse_dataset_definition( class_obj = next(obj for obj in trials if obj is not None) except StopIteration as exc: raise DataSetError( - f"Class `{class_obj}` not found or one of its dependencies " + f"Class '{class_obj}' not found or one of its dependencies " f"has not been installed." ) from exc if not issubclass(class_obj, AbstractDataSet): raise DataSetError( - f"DataSet type `{class_obj.__module__}.{class_obj.__qualname__}` " - f"is invalid: all data set types must extend `AbstractDataSet`." + f"DataSet type '{class_obj.__module__}.{class_obj.__qualname__}' " + f"is invalid: all data set types must extend 'AbstractDataSet'." 
) if VERSION_KEY in config: # remove "version" key so that it's not passed # to the "unversioned" data set constructor message = ( - "`%s` attribute removed from data set configuration since it is a " + "'%s' attribute removed from data set configuration since it is a " "reserved word and cannot be directly specified" ) logging.getLogger(__name__).warning(message, VERSION_KEY) @@ -581,7 +581,7 @@ def _get_save_path(self) -> PurePosixPath: if self._exists_function(str(versioned_path)): raise DataSetError( - f"Save path `{versioned_path}` for {str(self)} must not exist if " + f"Save path '{versioned_path}' for {str(self)} must not exist if " f"versioning is enabled." ) @@ -603,15 +603,15 @@ def save(self, data: Any) -> None: # FileNotFoundError raised in Win, NotADirectoryError raised in Unix _default_version = "YYYY-MM-DDThh.mm.ss.sssZ" raise DataSetError( - f"Cannot save versioned dataset `{self._filepath.name}` to " - f"`{self._filepath.parent.as_posix()}` because a file with the same " + f"Cannot save versioned dataset '{self._filepath.name}' to " + f"'{self._filepath.parent.as_posix()}' because a file with the same " f"name already exists in the directory. This is likely because " f"versioning was enabled on a dataset already saved previously. Either " - f"remove `{self._filepath.name}` from the directory or manually " + f"remove '{self._filepath.name}' from the directory or manually " f"convert it into a versioned dataset by placing it in a versioned " f"directory (e.g. with default versioning format " - f"`{self._filepath.as_posix()}/{_default_version}/{self._filepath.name}" - f"`)." + f"'{self._filepath.as_posix()}/{_default_version}/{self._filepath.name}" + f"')." ) from err load_version = self.resolve_load_version() @@ -735,5 +735,5 @@ def validate_on_forbidden_chars(**kwargs): for key, value in kwargs.items(): if " " in value or ";" in value: raise DataSetError( - f"Neither white-space nor semicolon are allowed in `{key}`." 
+ f"Neither white-space nor semicolon are allowed in '{key}'." ) diff --git a/kedro/utils.py b/kedro/utils.py index 4c57b7911..6067d96b6 100644 --- a/kedro/utils.py +++ b/kedro/utils.py @@ -24,5 +24,5 @@ def load_obj(obj_path: str, default_obj_path: str = "") -> Any: obj_name = obj_path_list[0] module_obj = importlib.import_module(obj_path) if not hasattr(module_obj, obj_name): - raise AttributeError(f"Object `{obj_name}` cannot be loaded from `{obj_path}`.") + raise AttributeError(f"Object '{obj_name}' cannot be loaded from '{obj_path}'.") return getattr(module_obj, obj_name) From d490f9ea53debea08d2cc866c98726c2579c5932 Mon Sep 17 00:00:00 2001 From: Niels Drost Date: Mon, 4 Jul 2022 13:24:02 +0200 Subject: [PATCH 34/44] Generic typings abstract dataset (#1612) --- kedro/io/core.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index c0f13bf9a..f475ef02d 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -13,7 +13,7 @@ from glob import iglob from operator import attrgetter from pathlib import Path, PurePath, PurePosixPath -from typing import Any, Callable, Dict, List, Optional, Tuple, Type +from typing import Any, Callable, Dict, Generic, List, Optional, Tuple, Type, TypeVar from urllib.parse import urlsplit from cachetools import Cache, cachedmethod @@ -66,7 +66,11 @@ class VersionNotFoundError(DataSetError): pass -class AbstractDataSet(abc.ABC): +_DI = TypeVar("_DI") +_DO = TypeVar("_DO") + + +class AbstractDataSet(abc.ABC, Generic[_DI, _DO]): """``AbstractDataSet`` is the base class for all data set implementations. All data set implementations should extend this abstract class and implement the methods marked as abstract. 
@@ -81,7 +85,7 @@ class AbstractDataSet(abc.ABC): >>> from kedro.io import AbstractDataSet >>> >>> - >>> class MyOwnDataSet(AbstractDataSet): + >>> class MyOwnDataSet(AbstractDataSet[pd.DataFrame, pd.DataFrame]): >>> def __init__(self, filepath, param1, param2=True): >>> self._filepath = PurePosixPath(filepath) >>> self._param1 = param1 @@ -165,7 +169,7 @@ def from_config( def _logger(self) -> logging.Logger: return logging.getLogger(__name__) - def load(self) -> Any: + def load(self) -> _DO: """Loads data by delegation to the provided load method. Returns: @@ -190,7 +194,7 @@ def load(self) -> Any: ) raise DataSetError(message) from exc - def save(self, data: Any) -> None: + def save(self, data: _DI) -> None: """Saves data by delegation to the provided save method. Args: @@ -244,14 +248,14 @@ def _to_str(obj, is_root=False): return f"{type(self).__name__}({_to_str(self._describe(), True)})" @abc.abstractmethod - def _load(self) -> Any: + def _load(self) -> _DO: raise NotImplementedError( f"'{self.__class__.__name__}' is a subclass of AbstractDataSet and " f"it must implement the '_load' method" ) @abc.abstractmethod - def _save(self, data: Any) -> None: + def _save(self, data: _DI) -> None: raise NotImplementedError( f"'{self.__class__.__name__}' is a subclass of AbstractDataSet and " f"it must implement the '_save' method" @@ -450,7 +454,7 @@ def _local_exists(filepath: str) -> bool: # SKIP_IF_NO_SPARK return filepath.exists() or any(par.is_file() for par in filepath.parents) -class AbstractVersionedDataSet(AbstractDataSet, abc.ABC): +class AbstractVersionedDataSet(AbstractDataSet[_DI, _DO], abc.ABC): """ ``AbstractVersionedDataSet`` is the base class for all versioned data set implementations. 
All data sets that implement versioning should extend this @@ -590,11 +594,11 @@ def _get_save_path(self) -> PurePosixPath: def _get_versioned_path(self, version: str) -> PurePosixPath: return self._filepath / version / self._filepath.name - def load(self) -> Any: + def load(self) -> _DO: self.resolve_load_version() # Make sure last load version is set return super().load() - def save(self, data: Any) -> None: + def save(self, data: _DI) -> None: self._version_cache.clear() save_version = self.resolve_save_version() # Make sure last save version is set try: From ee39e5ef1b1b5c5825233173327d4162f1adf425 Mon Sep 17 00:00:00 2001 From: Sajid Alam <90610031+SajidAlamQB@users.noreply.github.com> Date: Fri, 9 Sep 2022 17:20:15 +0100 Subject: [PATCH 35/44] Add gdrive to CLOUD_PROTOCOLS list (#1708) --- kedro/io/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index f475ef02d..9765e0bae 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -28,7 +28,7 @@ VERSION_KEY = "version" HTTP_PROTOCOLS = ("http", "https") PROTOCOL_DELIMITER = "://" -CLOUD_PROTOCOLS = ("s3", "gcs", "gs", "adl", "abfs", "abfss") +CLOUD_PROTOCOLS = ("s3", "gcs", "gs", "adl", "abfs", "abfss", "gdrive") class DataSetError(Exception): From c713e7d0f536331bab01fb0f0b5d78fbede80d39 Mon Sep 17 00:00:00 2001 From: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com> Date: Mon, 3 Oct 2022 11:30:47 +0100 Subject: [PATCH 36/44] Update Error message for `VersionNotFoundError` to handle Permission related issues better (#1881) * Update message for VersionNotFoundError Signed-off-by: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com> * Add test for VersionNotFoundError for cloud protocols * Update test_data_catalog.py Update NoVersionFoundError test * minor linting update * update docs link + styling changes * Revert "update docs link + styling changes" This reverts commit 6088e00159a9ee844dfee312673654b6d248f931. 
* Update test with styling changes * Update RELEASE.md Signed-off-by: ankatiyar Signed-off-by: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com> Signed-off-by: ankatiyar Co-authored-by: Ahdra Merali <90615669+AhdraMeraliQB@users.noreply.github.com> --- kedro/io/core.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index 9765e0bae..fc6dea587 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -538,9 +538,16 @@ def _fetch_latest_load_version(self) -> str: most_recent = next( (path for path in version_paths if self._exists_function(path)), None ) - + protocol = getattr(self, "_protocol", None) if not most_recent: - raise VersionNotFoundError(f"Did not find any versions for {self}") + if protocol in CLOUD_PROTOCOLS: + message = ( + f"Did not find any versions for {self}. This could be " + f"due to insufficient permission." + ) + else: + message = f"Did not find any versions for {self}" + raise VersionNotFoundError(message) return PurePath(most_recent).parent.name From 781fa1e354cebbe1f01d10ef7c52462aeee4e67c Mon Sep 17 00:00:00 2001 From: Nok Lam Chan Date: Thu, 20 Oct 2022 11:11:19 +0100 Subject: [PATCH 37/44] Remove redundant `resolve_load_version` call (#1911) * remove a redundant function call Signed-off-by: Nok Chan * Remove redundant resolove_load_version & fix test Signed-off-by: Nok Chan * Fix HoloviewWriter tests with more specific error message pattern & Lint Signed-off-by: Nok Chan * Rename tests Signed-off-by: Nok Chan Signed-off-by: Nok Chan --- kedro/io/core.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index fc6dea587..98f2bb1d6 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -548,7 +548,6 @@ def _fetch_latest_load_version(self) -> str: else: message = f"Did not find any versions for {self}" raise VersionNotFoundError(message) - return PurePath(most_recent).parent.name # 'key' is set to prevent cache 
key overlapping for load and save: @@ -601,8 +600,7 @@ def _get_save_path(self) -> PurePosixPath: def _get_versioned_path(self, version: str) -> PurePosixPath: return self._filepath / version / self._filepath.name - def load(self) -> _DO: - self.resolve_load_version() # Make sure last load version is set + def load(self) -> _DO: # pylint: disable=useless-parent-delegation return super().load() def save(self, data: _DI) -> None: From 5c134e591211a0db2674860c4df81865e96c505c Mon Sep 17 00:00:00 2001 From: Nok Lam Chan Date: Wed, 2 Nov 2022 15:08:04 +0000 Subject: [PATCH 38/44] Make Kedro instantiate datasets from `kedro_dataset`with higher priority than `kedro.extras.datasets` (#1734) * Update release doc Signed-off-by: Nok Chan * Make kedro.datasets with higher priority Signed-off-by: Nok Chan * Update the library priorities Signed-off-by: Nok Chan * Update release notes Signed-off-by: Nok Chan * Add test Signed-off-by: Nok Chan * Modify test to remove the dummy obj Signed-off-by: Nok Chan * Fix mocker.spy with different API for Python 3.7 Signed-off-by: Nok Chan * Please the linter Signed-off-by: Nok Chan * Black Signed-off-by: Nok Chan Signed-off-by: Nok Chan --- kedro/io/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index 98f2bb1d6..289650a79 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -347,7 +347,7 @@ class Version(namedtuple("Version", ["load", "save"])): "intermediate data sets where possible to avoid this warning." 
) -_DEFAULT_PACKAGES = ["kedro.io.", "kedro.extras.datasets.", ""] +_DEFAULT_PACKAGES = ["kedro.io.", "kedro_datasets.", "kedro.extras.datasets.", ""] def parse_dataset_definition( From 8b03ab61f3ceadc28379adc703cada17704275f0 Mon Sep 17 00:00:00 2001 From: Jannic <37243923+jmholzer@users.noreply.github.com> Date: Mon, 16 Jan 2023 15:12:19 +0000 Subject: [PATCH 39/44] Add support for Azure Data Lake Storage Gen2 URIs (#2190) * Add failing test Signed-off-by: Jannic Holzer * Add patch specific for abfss Signed-off-by: Jannic Holzer * Fix linting Signed-off-by: Jannic Holzer * Add release note Signed-off-by: Jannic Holzer * Add comment explaining adding username to abfss path Signed-off-by: Jannic Holzer * Modify release note to 'fixed' Signed-off-by: Jannic Holzer Signed-off-by: Jannic Holzer --- kedro/io/core.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index 289650a79..02b3708ed 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -685,11 +685,14 @@ def _parse_filepath(filepath: str) -> Dict[str, str]: options = {"protocol": protocol, "path": path} - if parsed_path.netloc: - if protocol in CLOUD_PROTOCOLS: - host_with_port = parsed_path.netloc.rsplit("@", 1)[-1] - host = host_with_port.rsplit(":", 1)[0] - options["path"] = host + options["path"] + if parsed_path.netloc and protocol in CLOUD_PROTOCOLS: + host_with_port = parsed_path.netloc.rsplit("@", 1)[-1] + host = host_with_port.rsplit(":", 1)[0] + options["path"] = host + options["path"] + # Azure Data Lake Storage Gen2 URIs can store the container name in the + # 'username' field of a URL (@ syntax), so we need to add it to the path + if protocol == "abfss" and parsed_path.username: + options["path"] = parsed_path.username + "@" + options["path"] return options From 860a4ee590374b615aa84c0589b255d9fee59328 Mon Sep 17 00:00:00 2001 From: Miguel Rodriguez Gutierrez Date: Fri, 24 Feb 2023 09:15:59 -0600 Subject: [PATCH 40/44] Fix `s3n` 
and `s3a` bug by adding them to CLOUD_PROTOCOLS (#2326) Signed-off-by: Miguel Rodriguez Gutierrez --- kedro/io/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro/io/core.py b/kedro/io/core.py index 02b3708ed..dc64e83e5 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -28,7 +28,7 @@ VERSION_KEY = "version" HTTP_PROTOCOLS = ("http", "https") PROTOCOL_DELIMITER = "://" -CLOUD_PROTOCOLS = ("s3", "gcs", "gs", "adl", "abfs", "abfss", "gdrive") +CLOUD_PROTOCOLS = ("s3", "s3n", "s3a", "gcs", "gs", "adl", "abfs", "abfss", "gdrive") class DataSetError(Exception): From bd1ebfdeade49acd2a6aa18b4fd4a58163d7a60d Mon Sep 17 00:00:00 2001 From: brandonmeek Date: Thu, 9 Mar 2023 16:21:42 -0600 Subject: [PATCH 41/44] Added kedro/io/core.py kedro/io/__init__.py and kedro/utils.py from patch to keep history. --- .../kedro_datasets}/io/__init__.py | 11 ----------- {kedro => kedro-datasets/kedro_datasets}/io/core.py | 0 {kedro => kedro-datasets/kedro_datasets/io}/utils.py | 0 3 files changed, 11 deletions(-) rename {kedro => kedro-datasets/kedro_datasets}/io/__init__.py (58%) rename {kedro => kedro-datasets/kedro_datasets}/io/core.py (100%) rename {kedro => kedro-datasets/kedro_datasets/io}/utils.py (100%) diff --git a/kedro/io/__init__.py b/kedro-datasets/kedro_datasets/io/__init__.py similarity index 58% rename from kedro/io/__init__.py rename to kedro-datasets/kedro_datasets/io/__init__.py index de6f6e49e..9e37111c0 100644 --- a/kedro/io/__init__.py +++ b/kedro-datasets/kedro_datasets/io/__init__.py @@ -3,7 +3,6 @@ which allows implementation of various ``AbstractDataSet``s. 
""" -from .cached_dataset import CachedDataSet from .core import ( AbstractDataSet, AbstractVersionedDataSet, @@ -12,22 +11,12 @@ DataSetNotFoundError, Version, ) -from .data_catalog import DataCatalog -from .lambda_dataset import LambdaDataSet -from .memory_dataset import MemoryDataSet -from .partitioned_dataset import IncrementalDataSet, PartitionedDataSet __all__ = [ "AbstractDataSet", "AbstractVersionedDataSet", - "CachedDataSet", - "DataCatalog", "DataSetAlreadyExistsError", "DataSetError", "DataSetNotFoundError", - "IncrementalDataSet", - "LambdaDataSet", - "MemoryDataSet", - "PartitionedDataSet", "Version", ] diff --git a/kedro/io/core.py b/kedro-datasets/kedro_datasets/io/core.py similarity index 100% rename from kedro/io/core.py rename to kedro-datasets/kedro_datasets/io/core.py diff --git a/kedro/utils.py b/kedro-datasets/kedro_datasets/io/utils.py similarity index 100% rename from kedro/utils.py rename to kedro-datasets/kedro_datasets/io/utils.py From 85125f27021af6eba948aad5cc156261f904b38f Mon Sep 17 00:00:00 2001 From: brandonmeek Date: Thu, 9 Mar 2023 17:03:27 -0600 Subject: [PATCH 42/44] Make AbstractDataSets from `kedro_datasets` instead of `kedro` --- kedro-datasets/kedro_datasets/api/api_dataset.py | 2 +- .../kedro_datasets/biosequence/biosequence_dataset.py | 2 +- kedro-datasets/kedro_datasets/dask/parquet_dataset.py | 2 +- kedro-datasets/kedro_datasets/email/message_dataset.py | 2 +- kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py | 2 +- kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py | 2 +- kedro-datasets/kedro_datasets/io/core.py | 6 +++--- kedro-datasets/kedro_datasets/json/json_dataset.py | 2 +- .../kedro_datasets/matplotlib/matplotlib_writer.py | 2 +- kedro-datasets/kedro_datasets/networkx/gml_dataset.py | 2 +- kedro-datasets/kedro_datasets/networkx/graphml_dataset.py | 2 +- kedro-datasets/kedro_datasets/networkx/json_dataset.py | 2 +- kedro-datasets/kedro_datasets/pandas/csv_dataset.py | 2 +- 
kedro-datasets/kedro_datasets/pandas/excel_dataset.py | 2 +- kedro-datasets/kedro_datasets/pandas/feather_dataset.py | 2 +- kedro-datasets/kedro_datasets/pandas/gbq_dataset.py | 2 +- kedro-datasets/kedro_datasets/pandas/generic_dataset.py | 2 +- kedro-datasets/kedro_datasets/pandas/hdf_dataset.py | 2 +- kedro-datasets/kedro_datasets/pandas/json_dataset.py | 2 +- kedro-datasets/kedro_datasets/pandas/parquet_dataset.py | 2 +- kedro-datasets/kedro_datasets/pandas/sql_dataset.py | 2 +- kedro-datasets/kedro_datasets/pandas/xml_dataset.py | 2 +- kedro-datasets/kedro_datasets/pickle/pickle_dataset.py | 2 +- kedro-datasets/kedro_datasets/pillow/image_dataset.py | 2 +- kedro-datasets/kedro_datasets/plotly/json_dataset.py | 2 +- kedro-datasets/kedro_datasets/plotly/plotly_dataset.py | 2 +- kedro-datasets/kedro_datasets/polars/csv_dataset.py | 2 +- kedro-datasets/kedro_datasets/redis/redis_dataset.py | 2 +- kedro-datasets/kedro_datasets/snowflake/snowpark_dataset.py | 2 +- kedro-datasets/kedro_datasets/spark/deltatable_dataset.py | 2 +- kedro-datasets/kedro_datasets/spark/spark_dataset.py | 2 +- kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py | 2 +- kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py | 2 +- kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py | 2 +- .../kedro_datasets/tensorflow/tensorflow_model_dataset.py | 2 +- kedro-datasets/kedro_datasets/text/text_dataset.py | 2 +- kedro-datasets/kedro_datasets/tracking/json_dataset.py | 2 +- kedro-datasets/kedro_datasets/tracking/metrics_dataset.py | 2 +- kedro-datasets/kedro_datasets/video/video_dataset.py | 2 +- kedro-datasets/kedro_datasets/yaml/yaml_dataset.py | 2 +- kedro-datasets/test_requirements.txt | 1 + kedro-datasets/tests/api/test_api_dataset.py | 2 +- .../tests/bioinformatics/test_biosequence_dataset.py | 4 ++-- kedro-datasets/tests/conftest.py | 2 +- kedro-datasets/tests/dask/test_parquet_dataset.py | 2 +- kedro-datasets/tests/email/test_message_dataset.py | 4 ++-- 
kedro-datasets/tests/geojson/test_geojson_dataset.py | 4 ++-- kedro-datasets/tests/holoviews/test_holoviews_writer.py | 4 ++-- kedro-datasets/tests/json/test_json_dataset.py | 4 ++-- kedro-datasets/tests/libsvm/test_svmlight_dataset.py | 4 ++-- kedro-datasets/tests/matplotlib/test_matplotlib_writer.py | 2 +- kedro-datasets/tests/networkx/test_gml_dataset.py | 4 ++-- kedro-datasets/tests/networkx/test_graphml_dataset.py | 4 ++-- kedro-datasets/tests/networkx/test_json_dataset.py | 4 ++-- kedro-datasets/tests/pandas/test_csv_dataset.py | 4 ++-- kedro-datasets/tests/pandas/test_excel_dataset.py | 4 ++-- kedro-datasets/tests/pandas/test_feather_dataset.py | 4 ++-- kedro-datasets/tests/pandas/test_gbq_dataset.py | 2 +- kedro-datasets/tests/pandas/test_generic_dataset.py | 4 ++-- kedro-datasets/tests/pandas/test_hdf_dataset.py | 4 ++-- kedro-datasets/tests/pandas/test_json_dataset.py | 4 ++-- kedro-datasets/tests/pandas/test_parquet_dataset.py | 4 ++-- kedro-datasets/tests/pandas/test_sql_dataset.py | 2 +- kedro-datasets/tests/pandas/test_xml_dataset.py | 4 ++-- kedro-datasets/tests/pickle/test_pickle_dataset.py | 4 ++-- kedro-datasets/tests/pillow/test_image_dataset.py | 4 ++-- kedro-datasets/tests/plotly/test_json_dataset.py | 4 ++-- kedro-datasets/tests/plotly/test_plotly_dataset.py | 4 ++-- kedro-datasets/tests/polars/test_csv_dataset.py | 4 ++-- kedro-datasets/tests/redis/test_redis_dataset.py | 2 +- kedro-datasets/tests/snowflake/test_snowpark_dataset.py | 2 +- kedro-datasets/tests/spark/test_deltatable_dataset.py | 3 ++- kedro-datasets/tests/spark/test_spark_dataset.py | 5 +++-- kedro-datasets/tests/spark/test_spark_hive_dataset.py | 2 +- kedro-datasets/tests/spark/test_spark_jdbc_dataset.py | 2 +- .../tests/tensorflow/test_tensorflow_model_dataset.py | 4 ++-- kedro-datasets/tests/text/test_text_dataset.py | 4 ++-- kedro-datasets/tests/tracking/test_json_dataset.py | 4 ++-- kedro-datasets/tests/tracking/test_metrics_dataset.py | 4 ++-- 
kedro-datasets/tests/video/test_video_dataset.py | 2 +- kedro-datasets/tests/yaml/test_yaml_dataset.py | 4 ++-- 81 files changed, 113 insertions(+), 110 deletions(-) diff --git a/kedro-datasets/kedro_datasets/api/api_dataset.py b/kedro-datasets/kedro_datasets/api/api_dataset.py index 4f0ffb4cc..cdb950331 100644 --- a/kedro-datasets/kedro_datasets/api/api_dataset.py +++ b/kedro-datasets/kedro_datasets/api/api_dataset.py @@ -4,7 +4,7 @@ from typing import Any, Dict, Iterable, List, NoReturn, Union import requests -from kedro.io.core import AbstractDataSet, DataSetError +from kedro_datasets.io.core import AbstractDataSet, DataSetError from requests.auth import AuthBase diff --git a/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py b/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py index 7c45743da..726e8dbca 100644 --- a/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py +++ b/kedro-datasets/kedro_datasets/biosequence/biosequence_dataset.py @@ -7,7 +7,7 @@ import fsspec from Bio import SeqIO -from kedro.io.core import AbstractDataSet, get_filepath_str, get_protocol_and_path +from kedro_datasets.io.core import AbstractDataSet, get_filepath_str, get_protocol_and_path class BioSequenceDataSet(AbstractDataSet[List, List]): diff --git a/kedro-datasets/kedro_datasets/dask/parquet_dataset.py b/kedro-datasets/kedro_datasets/dask/parquet_dataset.py index f02144892..3b3959e64 100644 --- a/kedro-datasets/kedro_datasets/dask/parquet_dataset.py +++ b/kedro-datasets/kedro_datasets/dask/parquet_dataset.py @@ -7,7 +7,7 @@ import dask.dataframe as dd import fsspec import triad -from kedro.io.core import AbstractDataSet, get_protocol_and_path +from kedro_datasets.io.core import AbstractDataSet, get_protocol_and_path class ParquetDataSet(AbstractDataSet[dd.DataFrame, dd.DataFrame]): diff --git a/kedro-datasets/kedro_datasets/email/message_dataset.py b/kedro-datasets/kedro_datasets/email/message_dataset.py index 0b8623f63..9ab59cc76 100644 --- 
a/kedro-datasets/kedro_datasets/email/message_dataset.py +++ b/kedro-datasets/kedro_datasets/email/message_dataset.py @@ -11,7 +11,7 @@ from typing import Any, Dict import fsspec -from kedro.io.core import ( +from kedro_datasets.io.core import ( AbstractVersionedDataSet, DataSetError, Version, diff --git a/kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py b/kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py index ba9237909..afccf4a04 100644 --- a/kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py +++ b/kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py @@ -8,7 +8,7 @@ import fsspec import geopandas as gpd -from kedro.io.core import ( +from kedro_datasets.io.core import ( AbstractVersionedDataSet, DataSetError, Version, diff --git a/kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py b/kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py index 7f61909b9..e9f12e645 100644 --- a/kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py +++ b/kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py @@ -8,7 +8,7 @@ import fsspec import holoviews as hv -from kedro.io.core import ( +from kedro_datasets.io.core import ( AbstractVersionedDataSet, DataSetError, Version, diff --git a/kedro-datasets/kedro_datasets/io/core.py b/kedro-datasets/kedro_datasets/io/core.py index dc64e83e5..b8ac06f7c 100644 --- a/kedro-datasets/kedro_datasets/io/core.py +++ b/kedro-datasets/kedro_datasets/io/core.py @@ -19,7 +19,7 @@ from cachetools import Cache, cachedmethod from cachetools.keys import hashkey -from kedro.utils import load_obj +from .utils import load_obj warnings.simplefilter("default", DeprecationWarning) @@ -82,7 +82,7 @@ class AbstractDataSet(abc.ABC, Generic[_DI, _DO]): >>> from pathlib import Path, PurePosixPath >>> import pandas as pd - >>> from kedro.io import AbstractDataSet + >>> from kedro_datasets.io import AbstractDataSet >>> >>> >>> class MyOwnDataSet(AbstractDataSet[pd.DataFrame, pd.DataFrame]): @@ -465,7 +465,7 @@ 
class AbstractVersionedDataSet(AbstractDataSet[_DI, _DO], abc.ABC): >>> from pathlib import Path, PurePosixPath >>> import pandas as pd - >>> from kedro.io import AbstractVersionedDataSet + >>> from kedro_datasets.io import AbstractVersionedDataSet >>> >>> >>> class MyOwnDataSet(AbstractVersionedDataSet): diff --git a/kedro-datasets/kedro_datasets/json/json_dataset.py b/kedro-datasets/kedro_datasets/json/json_dataset.py index ad86c9a17..58c08a772 100644 --- a/kedro-datasets/kedro_datasets/json/json_dataset.py +++ b/kedro-datasets/kedro_datasets/json/json_dataset.py @@ -7,7 +7,7 @@ from typing import Any, Dict import fsspec -from kedro.io.core import ( +from kedro_datasets.io.core import ( AbstractVersionedDataSet, DataSetError, Version, diff --git a/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py b/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py index 3fc396cb1..4ea44efd3 100644 --- a/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py +++ b/kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py @@ -9,7 +9,7 @@ import fsspec import matplotlib.pyplot as plt -from kedro.io.core import ( +from kedro_datasets.io.core import ( AbstractVersionedDataSet, DataSetError, Version, diff --git a/kedro-datasets/kedro_datasets/networkx/gml_dataset.py b/kedro-datasets/kedro_datasets/networkx/gml_dataset.py index bc8d4f86f..1b73facd9 100644 --- a/kedro-datasets/kedro_datasets/networkx/gml_dataset.py +++ b/kedro-datasets/kedro_datasets/networkx/gml_dataset.py @@ -9,7 +9,7 @@ import fsspec import networkx -from kedro.io.core import ( +from kedro_datasets.io.core import ( AbstractVersionedDataSet, Version, get_filepath_str, diff --git a/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py b/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py index 2105fb67f..36230e069 100644 --- a/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py +++ b/kedro-datasets/kedro_datasets/networkx/graphml_dataset.py @@ -8,7 +8,7 @@ import fsspec 
import networkx -from kedro.io.core import ( +from kedro_datasets.io.core import ( AbstractVersionedDataSet, Version, get_filepath_str, diff --git a/kedro-datasets/kedro_datasets/networkx/json_dataset.py b/kedro-datasets/kedro_datasets/networkx/json_dataset.py index 8cc436721..1dd6ef62c 100644 --- a/kedro-datasets/kedro_datasets/networkx/json_dataset.py +++ b/kedro-datasets/kedro_datasets/networkx/json_dataset.py @@ -9,7 +9,7 @@ import fsspec import networkx -from kedro.io.core import ( +from kedro_datasets.io.core import ( AbstractVersionedDataSet, Version, get_filepath_str, diff --git a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py index 7b20813f3..cebefc2b4 100644 --- a/kedro-datasets/kedro_datasets/pandas/csv_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/csv_dataset.py @@ -9,7 +9,7 @@ import fsspec import pandas as pd -from kedro.io.core import ( +from kedro_datasets.io.core import ( PROTOCOL_DELIMITER, AbstractVersionedDataSet, DataSetError, diff --git a/kedro-datasets/kedro_datasets/pandas/excel_dataset.py b/kedro-datasets/kedro_datasets/pandas/excel_dataset.py index 4a981bc11..729b48815 100644 --- a/kedro-datasets/kedro_datasets/pandas/excel_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/excel_dataset.py @@ -9,7 +9,7 @@ import fsspec import pandas as pd -from kedro.io.core import ( +from kedro_datasets.io.core import ( PROTOCOL_DELIMITER, AbstractVersionedDataSet, DataSetError, diff --git a/kedro-datasets/kedro_datasets/pandas/feather_dataset.py b/kedro-datasets/kedro_datasets/pandas/feather_dataset.py index 1116d4168..c6003b02b 100644 --- a/kedro-datasets/kedro_datasets/pandas/feather_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/feather_dataset.py @@ -10,7 +10,7 @@ import fsspec import pandas as pd -from kedro.io.core import ( +from kedro_datasets.io.core import ( PROTOCOL_DELIMITER, AbstractVersionedDataSet, Version, diff --git 
a/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py b/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py index c0122a6c0..efeb27b60 100644 --- a/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/gbq_dataset.py @@ -11,7 +11,7 @@ from google.cloud import bigquery from google.cloud.exceptions import NotFound from google.oauth2.credentials import Credentials -from kedro.io.core import ( +from kedro_datasets.io.core import ( AbstractDataSet, DataSetError, get_filepath_str, diff --git a/kedro-datasets/kedro_datasets/pandas/generic_dataset.py b/kedro-datasets/kedro_datasets/pandas/generic_dataset.py index 86e347d70..8c449e298 100644 --- a/kedro-datasets/kedro_datasets/pandas/generic_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/generic_dataset.py @@ -8,7 +8,7 @@ import fsspec import pandas as pd -from kedro.io.core import ( +from kedro_datasets.io.core import ( AbstractVersionedDataSet, DataSetError, Version, diff --git a/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py b/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py index f11fe320f..fcb54fdb8 100644 --- a/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/hdf_dataset.py @@ -8,7 +8,7 @@ import fsspec import pandas as pd -from kedro.io.core import ( +from kedro_datasets.io.core import ( AbstractVersionedDataSet, DataSetError, Version, diff --git a/kedro-datasets/kedro_datasets/pandas/json_dataset.py b/kedro-datasets/kedro_datasets/pandas/json_dataset.py index d29ef57bd..7e0f51a00 100644 --- a/kedro-datasets/kedro_datasets/pandas/json_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/json_dataset.py @@ -9,7 +9,7 @@ import fsspec import pandas as pd -from kedro.io.core import ( +from kedro_datasets.io.core import ( PROTOCOL_DELIMITER, AbstractVersionedDataSet, DataSetError, diff --git a/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py b/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py index 
acb478bd9..d4c48c8a1 100644 --- a/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/parquet_dataset.py @@ -9,7 +9,7 @@ import fsspec import pandas as pd -from kedro.io.core import ( +from kedro_datasets.io.core import ( PROTOCOL_DELIMITER, AbstractVersionedDataSet, DataSetError, diff --git a/kedro-datasets/kedro_datasets/pandas/sql_dataset.py b/kedro-datasets/kedro_datasets/pandas/sql_dataset.py index dd5d636a1..fe9ff8e4c 100644 --- a/kedro-datasets/kedro_datasets/pandas/sql_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/sql_dataset.py @@ -8,7 +8,7 @@ import fsspec import pandas as pd -from kedro.io.core import ( +from kedro_datasets.io.core import ( AbstractDataSet, DataSetError, get_filepath_str, diff --git a/kedro-datasets/kedro_datasets/pandas/xml_dataset.py b/kedro-datasets/kedro_datasets/pandas/xml_dataset.py index ca8fc0dd2..35fb4d54e 100644 --- a/kedro-datasets/kedro_datasets/pandas/xml_dataset.py +++ b/kedro-datasets/kedro_datasets/pandas/xml_dataset.py @@ -9,7 +9,7 @@ import fsspec import pandas as pd -from kedro.io.core import ( +from kedro_datasets.io.core import ( PROTOCOL_DELIMITER, AbstractVersionedDataSet, DataSetError, diff --git a/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py b/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py index 436fba29a..c6f0e9cd9 100644 --- a/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py +++ b/kedro-datasets/kedro_datasets/pickle/pickle_dataset.py @@ -9,7 +9,7 @@ from typing import Any, Dict import fsspec -from kedro.io.core import ( +from kedro_datasets.io.core import ( AbstractVersionedDataSet, DataSetError, Version, diff --git a/kedro-datasets/kedro_datasets/pillow/image_dataset.py b/kedro-datasets/kedro_datasets/pillow/image_dataset.py index ca939b722..8083d04cd 100644 --- a/kedro-datasets/kedro_datasets/pillow/image_dataset.py +++ b/kedro-datasets/kedro_datasets/pillow/image_dataset.py @@ -6,7 +6,7 @@ from typing import Any, Dict import 
fsspec -from kedro.io.core import ( +from kedro_datasets.io.core import ( AbstractVersionedDataSet, DataSetError, Version, diff --git a/kedro-datasets/kedro_datasets/plotly/json_dataset.py b/kedro-datasets/kedro_datasets/plotly/json_dataset.py index f819dd338..f28123361 100644 --- a/kedro-datasets/kedro_datasets/plotly/json_dataset.py +++ b/kedro-datasets/kedro_datasets/plotly/json_dataset.py @@ -7,7 +7,7 @@ import fsspec import plotly.io as pio -from kedro.io.core import ( +from kedro_datasets.io.core import ( AbstractVersionedDataSet, Version, get_filepath_str, diff --git a/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py b/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py index 1bb0acef6..11d638cf6 100644 --- a/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py +++ b/kedro-datasets/kedro_datasets/plotly/plotly_dataset.py @@ -7,7 +7,7 @@ import pandas as pd import plotly.express as px -from kedro.io.core import Version +from kedro_datasets.io.core import Version from plotly import graph_objects as go from .json_dataset import JSONDataSet diff --git a/kedro-datasets/kedro_datasets/polars/csv_dataset.py b/kedro-datasets/kedro_datasets/polars/csv_dataset.py index 60a0d456a..52bb96a1e 100644 --- a/kedro-datasets/kedro_datasets/polars/csv_dataset.py +++ b/kedro-datasets/kedro_datasets/polars/csv_dataset.py @@ -9,7 +9,7 @@ import fsspec import polars as pl -from kedro.io.core import ( +from kedro_datasets.io.core import ( PROTOCOL_DELIMITER, AbstractVersionedDataSet, DataSetError, diff --git a/kedro-datasets/kedro_datasets/redis/redis_dataset.py b/kedro-datasets/kedro_datasets/redis/redis_dataset.py index 6d2f80df9..085ed68aa 100644 --- a/kedro-datasets/kedro_datasets/redis/redis_dataset.py +++ b/kedro-datasets/kedro_datasets/redis/redis_dataset.py @@ -8,7 +8,7 @@ from typing import Any, Dict import redis -from kedro.io.core import AbstractDataSet, DataSetError +from kedro_datasets.io.core import AbstractDataSet, DataSetError class 
PickleDataSet(AbstractDataSet[Any, Any]): diff --git a/kedro-datasets/kedro_datasets/snowflake/snowpark_dataset.py b/kedro-datasets/kedro_datasets/snowflake/snowpark_dataset.py index e0ea1c1db..3746681d1 100644 --- a/kedro-datasets/kedro_datasets/snowflake/snowpark_dataset.py +++ b/kedro-datasets/kedro_datasets/snowflake/snowpark_dataset.py @@ -5,7 +5,7 @@ from typing import Any, Dict import snowflake.snowpark as sp -from kedro.io.core import AbstractDataSet, DataSetError +from kedro_datasets.io.core import AbstractDataSet, DataSetError logger = logging.getLogger(__name__) diff --git a/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py b/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py index 34ee6f6a5..15fda12e9 100644 --- a/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/deltatable_dataset.py @@ -5,7 +5,7 @@ from typing import NoReturn from delta.tables import DeltaTable -from kedro.io.core import AbstractDataSet, DataSetError +from kedro_datasets.io.core import AbstractDataSet, DataSetError from pyspark.sql import SparkSession from pyspark.sql.utils import AnalysisException diff --git a/kedro-datasets/kedro_datasets/spark/spark_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_dataset.py index d366eae08..d2cf111f0 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_dataset.py @@ -13,7 +13,7 @@ import fsspec from hdfs import HdfsError, InsecureClient -from kedro.io.core import ( +from kedro_datasets.io.core import ( AbstractVersionedDataSet, DataSetError, Version, diff --git a/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py index 08b0666ea..1c8e3d6bf 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_hive_dataset.py @@ -5,7 +5,7 @@ from copy import deepcopy from typing import Any, Dict, List 
-from kedro.io.core import AbstractDataSet, DataSetError +from kedro_datasets.io.core import AbstractDataSet, DataSetError from pyspark.sql import DataFrame, SparkSession, Window from pyspark.sql.functions import col, lit, row_number diff --git a/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py index aab501f26..d02b2124a 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py @@ -3,7 +3,7 @@ from copy import deepcopy from typing import Any, Dict -from kedro.io.core import AbstractDataSet, DataSetError +from kedro_datasets.io.core import AbstractDataSet, DataSetError from pyspark.sql import DataFrame, SparkSession __all__ = ["SparkJDBCDataSet"] diff --git a/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py b/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py index f909c1976..baf34f8cb 100644 --- a/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py +++ b/kedro-datasets/kedro_datasets/svmlight/svmlight_dataset.py @@ -7,7 +7,7 @@ from typing import Any, Dict, Optional, Tuple, Union import fsspec -from kedro.io.core import ( +from kedro_datasets.io.core import ( AbstractVersionedDataSet, DataSetError, Version, diff --git a/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py b/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py index 544aadb06..1683cb7ec 100644 --- a/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py +++ b/kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py @@ -8,7 +8,7 @@ import fsspec import tensorflow as tf -from kedro.io.core import ( +from kedro_datasets.io.core import ( AbstractVersionedDataSet, DataSetError, Version, diff --git a/kedro-datasets/kedro_datasets/text/text_dataset.py b/kedro-datasets/kedro_datasets/text/text_dataset.py index 0bb559e29..3552d463c 100644 --- 
a/kedro-datasets/kedro_datasets/text/text_dataset.py +++ b/kedro-datasets/kedro_datasets/text/text_dataset.py @@ -6,7 +6,7 @@ from typing import Any, Dict import fsspec -from kedro.io.core import ( +from kedro_datasets.io.core import ( AbstractVersionedDataSet, DataSetError, Version, diff --git a/kedro-datasets/kedro_datasets/tracking/json_dataset.py b/kedro-datasets/kedro_datasets/tracking/json_dataset.py index 4235df999..2f7e8f4a5 100644 --- a/kedro-datasets/kedro_datasets/tracking/json_dataset.py +++ b/kedro-datasets/kedro_datasets/tracking/json_dataset.py @@ -4,7 +4,7 @@ """ from typing import NoReturn -from kedro.io.core import DataSetError +from kedro_datasets.io.core import DataSetError from kedro_datasets.json import JSONDataSet as JDS diff --git a/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py b/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py index 7c7546a85..b63b3abfd 100644 --- a/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py +++ b/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py @@ -6,7 +6,7 @@ import json from typing import Dict, NoReturn -from kedro.io.core import DataSetError, get_filepath_str +from kedro_datasets.io.core import DataSetError, get_filepath_str from kedro_datasets.json import JSONDataSet diff --git a/kedro-datasets/kedro_datasets/video/video_dataset.py b/kedro-datasets/kedro_datasets/video/video_dataset.py index 03311146d..69e19f479 100644 --- a/kedro-datasets/kedro_datasets/video/video_dataset.py +++ b/kedro-datasets/kedro_datasets/video/video_dataset.py @@ -13,7 +13,7 @@ import fsspec import numpy as np import PIL.Image -from kedro.io.core import AbstractDataSet, get_protocol_and_path +from kedro_datasets.io.core import AbstractDataSet, get_protocol_and_path class SlicedVideo: diff --git a/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py b/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py index f2a3c2696..45009cad1 100644 --- a/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py +++ 
b/kedro-datasets/kedro_datasets/yaml/yaml_dataset.py @@ -7,7 +7,7 @@ import fsspec import yaml -from kedro.io.core import ( +from kedro_datasets.io.core import ( AbstractVersionedDataSet, DataSetError, Version, diff --git a/kedro-datasets/test_requirements.txt b/kedro-datasets/test_requirements.txt index c3baae0c7..f3a83283c 100644 --- a/kedro-datasets/test_requirements.txt +++ b/kedro-datasets/test_requirements.txt @@ -20,6 +20,7 @@ Jinja2<3.1.0 joblib>=0.14 jupyterlab~=3.0 jupyter~=1.0 +kedro~=0.18.4 lxml~=4.6 matplotlib>=3.0.3, <3.4; python_version < '3.10' # 3.4.0 breaks holoviews matplotlib>=3.5, <3.6; python_version == '3.10' diff --git a/kedro-datasets/tests/api/test_api_dataset.py b/kedro-datasets/tests/api/test_api_dataset.py index c84290750..9de4604a5 100644 --- a/kedro-datasets/tests/api/test_api_dataset.py +++ b/kedro-datasets/tests/api/test_api_dataset.py @@ -5,7 +5,7 @@ import pytest import requests import requests_mock -from kedro.io.core import DataSetError +from kedro_datasets.io.core import DataSetError from kedro_datasets.api import APIDataSet diff --git a/kedro-datasets/tests/bioinformatics/test_biosequence_dataset.py b/kedro-datasets/tests/bioinformatics/test_biosequence_dataset.py index 24666baaf..710791d28 100644 --- a/kedro-datasets/tests/bioinformatics/test_biosequence_dataset.py +++ b/kedro-datasets/tests/bioinformatics/test_biosequence_dataset.py @@ -6,8 +6,8 @@ from fsspec.implementations.http import HTTPFileSystem from fsspec.implementations.local import LocalFileSystem from gcsfs import GCSFileSystem -from kedro.io import DataSetError -from kedro.io.core import PROTOCOL_DELIMITER +from kedro_datasets.io import DataSetError +from kedro_datasets.io.core import PROTOCOL_DELIMITER from s3fs.core import S3FileSystem from kedro_datasets.biosequence import BioSequenceDataSet diff --git a/kedro-datasets/tests/conftest.py b/kedro-datasets/tests/conftest.py index 91d19f646..fc2916923 100644 --- a/kedro-datasets/tests/conftest.py +++ 
b/kedro-datasets/tests/conftest.py @@ -5,7 +5,7 @@ https://docs.pytest.org/en/latest/fixture.html """ -from kedro.io.core import generate_timestamp +from kedro_datasets.io.core import generate_timestamp from pytest import fixture diff --git a/kedro-datasets/tests/dask/test_parquet_dataset.py b/kedro-datasets/tests/dask/test_parquet_dataset.py index a2d89f060..8f41f5d38 100644 --- a/kedro-datasets/tests/dask/test_parquet_dataset.py +++ b/kedro-datasets/tests/dask/test_parquet_dataset.py @@ -4,7 +4,7 @@ import pyarrow as pa import pyarrow.parquet as pq import pytest -from kedro.io import DataSetError +from kedro_datasets.io import DataSetError from moto import mock_s3 from pandas.util.testing import assert_frame_equal from s3fs import S3FileSystem diff --git a/kedro-datasets/tests/email/test_message_dataset.py b/kedro-datasets/tests/email/test_message_dataset.py index 100daba52..3b8945aaa 100644 --- a/kedro-datasets/tests/email/test_message_dataset.py +++ b/kedro-datasets/tests/email/test_message_dataset.py @@ -6,8 +6,8 @@ from fsspec.implementations.http import HTTPFileSystem from fsspec.implementations.local import LocalFileSystem from gcsfs import GCSFileSystem -from kedro.io import DataSetError -from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.io import DataSetError +from kedro_datasets.io.core import PROTOCOL_DELIMITER, Version from s3fs.core import S3FileSystem from kedro_datasets.email import EmailMessageDataSet diff --git a/kedro-datasets/tests/geojson/test_geojson_dataset.py b/kedro-datasets/tests/geojson/test_geojson_dataset.py index 52fd292ff..cefc61e9c 100644 --- a/kedro-datasets/tests/geojson/test_geojson_dataset.py +++ b/kedro-datasets/tests/geojson/test_geojson_dataset.py @@ -5,8 +5,8 @@ from fsspec.implementations.http import HTTPFileSystem from fsspec.implementations.local import LocalFileSystem from gcsfs import GCSFileSystem -from kedro.io import DataSetError -from kedro.io.core import PROTOCOL_DELIMITER, Version, 
generate_timestamp +from kedro_datasets.io import DataSetError +from kedro_datasets.io.core import PROTOCOL_DELIMITER, Version, generate_timestamp from pandas.util.testing import assert_frame_equal from s3fs import S3FileSystem from shapely.geometry import Point diff --git a/kedro-datasets/tests/holoviews/test_holoviews_writer.py b/kedro-datasets/tests/holoviews/test_holoviews_writer.py index f4f91383e..9eb013f49 100644 --- a/kedro-datasets/tests/holoviews/test_holoviews_writer.py +++ b/kedro-datasets/tests/holoviews/test_holoviews_writer.py @@ -7,8 +7,8 @@ from fsspec.implementations.http import HTTPFileSystem from fsspec.implementations.local import LocalFileSystem from gcsfs import GCSFileSystem -from kedro.io import DataSetError, Version -from kedro.io.core import PROTOCOL_DELIMITER +from kedro_datasets.io import DataSetError, Version +from kedro_datasets.io.core import PROTOCOL_DELIMITER from s3fs.core import S3FileSystem from kedro_datasets.holoviews import HoloviewsWriter diff --git a/kedro-datasets/tests/json/test_json_dataset.py b/kedro-datasets/tests/json/test_json_dataset.py index 621e51fcd..35e923a3e 100644 --- a/kedro-datasets/tests/json/test_json_dataset.py +++ b/kedro-datasets/tests/json/test_json_dataset.py @@ -4,8 +4,8 @@ from fsspec.implementations.http import HTTPFileSystem from fsspec.implementations.local import LocalFileSystem from gcsfs import GCSFileSystem -from kedro.io import DataSetError -from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.io import DataSetError +from kedro_datasets.io.core import PROTOCOL_DELIMITER, Version from s3fs.core import S3FileSystem from kedro_datasets.json import JSONDataSet diff --git a/kedro-datasets/tests/libsvm/test_svmlight_dataset.py b/kedro-datasets/tests/libsvm/test_svmlight_dataset.py index 8fff3edd2..e20cf2f4d 100644 --- a/kedro-datasets/tests/libsvm/test_svmlight_dataset.py +++ b/kedro-datasets/tests/libsvm/test_svmlight_dataset.py @@ -5,8 +5,8 @@ from 
fsspec.implementations.http import HTTPFileSystem from fsspec.implementations.local import LocalFileSystem from gcsfs import GCSFileSystem -from kedro.io import DataSetError -from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.io import DataSetError +from kedro_datasets.io.core import PROTOCOL_DELIMITER, Version from s3fs.core import S3FileSystem from kedro_datasets.svmlight import SVMLightDataSet diff --git a/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py b/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py index 0745452c6..d25631199 100644 --- a/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py +++ b/kedro-datasets/tests/matplotlib/test_matplotlib_writer.py @@ -5,7 +5,7 @@ import matplotlib import matplotlib.pyplot as plt import pytest -from kedro.io import DataSetError, Version +from kedro_datasets.io import DataSetError, Version from moto import mock_s3 from s3fs import S3FileSystem diff --git a/kedro-datasets/tests/networkx/test_gml_dataset.py b/kedro-datasets/tests/networkx/test_gml_dataset.py index a3a89eca7..5a2ac51c8 100644 --- a/kedro-datasets/tests/networkx/test_gml_dataset.py +++ b/kedro-datasets/tests/networkx/test_gml_dataset.py @@ -5,8 +5,8 @@ from fsspec.implementations.http import HTTPFileSystem from fsspec.implementations.local import LocalFileSystem from gcsfs import GCSFileSystem -from kedro.io import DataSetError, Version -from kedro.io.core import PROTOCOL_DELIMITER +from kedro_datasets.io import DataSetError, Version +from kedro_datasets.io.core import PROTOCOL_DELIMITER from s3fs.core import S3FileSystem from kedro_datasets.networkx import GMLDataSet diff --git a/kedro-datasets/tests/networkx/test_graphml_dataset.py b/kedro-datasets/tests/networkx/test_graphml_dataset.py index 4e0dcf40d..fea1796dd 100644 --- a/kedro-datasets/tests/networkx/test_graphml_dataset.py +++ b/kedro-datasets/tests/networkx/test_graphml_dataset.py @@ -5,8 +5,8 @@ from fsspec.implementations.http import HTTPFileSystem 
from fsspec.implementations.local import LocalFileSystem from gcsfs import GCSFileSystem -from kedro.io import DataSetError, Version -from kedro.io.core import PROTOCOL_DELIMITER +from kedro_datasets.io import DataSetError, Version +from kedro_datasets.io.core import PROTOCOL_DELIMITER from s3fs.core import S3FileSystem from kedro_datasets.networkx import GraphMLDataSet diff --git a/kedro-datasets/tests/networkx/test_json_dataset.py b/kedro-datasets/tests/networkx/test_json_dataset.py index 4d6e582a8..ca1d59933 100644 --- a/kedro-datasets/tests/networkx/test_json_dataset.py +++ b/kedro-datasets/tests/networkx/test_json_dataset.py @@ -5,8 +5,8 @@ from fsspec.implementations.http import HTTPFileSystem from fsspec.implementations.local import LocalFileSystem from gcsfs import GCSFileSystem -from kedro.io import DataSetError, Version -from kedro.io.core import PROTOCOL_DELIMITER +from kedro_datasets.io import DataSetError, Version +from kedro_datasets.io.core import PROTOCOL_DELIMITER from s3fs.core import S3FileSystem from kedro_datasets.networkx import JSONDataSet diff --git a/kedro-datasets/tests/pandas/test_csv_dataset.py b/kedro-datasets/tests/pandas/test_csv_dataset.py index 267144ecc..92fb6d046 100644 --- a/kedro-datasets/tests/pandas/test_csv_dataset.py +++ b/kedro-datasets/tests/pandas/test_csv_dataset.py @@ -10,8 +10,8 @@ from fsspec.implementations.http import HTTPFileSystem from fsspec.implementations.local import LocalFileSystem from gcsfs import GCSFileSystem -from kedro.io import DataSetError -from kedro.io.core import PROTOCOL_DELIMITER, Version, generate_timestamp +from kedro_datasets.io import DataSetError +from kedro_datasets.io.core import PROTOCOL_DELIMITER, Version, generate_timestamp from moto import mock_s3 from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem diff --git a/kedro-datasets/tests/pandas/test_excel_dataset.py b/kedro-datasets/tests/pandas/test_excel_dataset.py index c568d15d0..c88d826a8 100644 --- 
a/kedro-datasets/tests/pandas/test_excel_dataset.py +++ b/kedro-datasets/tests/pandas/test_excel_dataset.py @@ -5,8 +5,8 @@ from fsspec.implementations.http import HTTPFileSystem from fsspec.implementations.local import LocalFileSystem from gcsfs import GCSFileSystem -from kedro.io import DataSetError -from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.io import DataSetError +from kedro_datasets.io.core import PROTOCOL_DELIMITER, Version from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem diff --git a/kedro-datasets/tests/pandas/test_feather_dataset.py b/kedro-datasets/tests/pandas/test_feather_dataset.py index 80c1ce678..1acbc4295 100644 --- a/kedro-datasets/tests/pandas/test_feather_dataset.py +++ b/kedro-datasets/tests/pandas/test_feather_dataset.py @@ -5,8 +5,8 @@ from fsspec.implementations.http import HTTPFileSystem from fsspec.implementations.local import LocalFileSystem from gcsfs import GCSFileSystem -from kedro.io import DataSetError -from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.io import DataSetError +from kedro_datasets.io.core import PROTOCOL_DELIMITER, Version from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem diff --git a/kedro-datasets/tests/pandas/test_gbq_dataset.py b/kedro-datasets/tests/pandas/test_gbq_dataset.py index e239dbaba..fcdacffc2 100644 --- a/kedro-datasets/tests/pandas/test_gbq_dataset.py +++ b/kedro-datasets/tests/pandas/test_gbq_dataset.py @@ -3,7 +3,7 @@ import pandas as pd import pytest from google.cloud.exceptions import NotFound -from kedro.io.core import DataSetError +from kedro_datasets.io.core import DataSetError from pandas.testing import assert_frame_equal from kedro_datasets.pandas import GBQQueryDataSet, GBQTableDataSet diff --git a/kedro-datasets/tests/pandas/test_generic_dataset.py b/kedro-datasets/tests/pandas/test_generic_dataset.py index 6f40bb0d4..f05027ff8 100644 --- 
a/kedro-datasets/tests/pandas/test_generic_dataset.py +++ b/kedro-datasets/tests/pandas/test_generic_dataset.py @@ -7,8 +7,8 @@ from fsspec.implementations.http import HTTPFileSystem from fsspec.implementations.local import LocalFileSystem from gcsfs import GCSFileSystem -from kedro.io import DataSetError, Version -from kedro.io.core import PROTOCOL_DELIMITER, generate_timestamp +from kedro_datasets.io import DataSetError, Version +from kedro_datasets.io.core import PROTOCOL_DELIMITER, generate_timestamp from pandas._testing import assert_frame_equal from s3fs import S3FileSystem diff --git a/kedro-datasets/tests/pandas/test_hdf_dataset.py b/kedro-datasets/tests/pandas/test_hdf_dataset.py index 563ba63d9..d2c20a3d4 100644 --- a/kedro-datasets/tests/pandas/test_hdf_dataset.py +++ b/kedro-datasets/tests/pandas/test_hdf_dataset.py @@ -5,8 +5,8 @@ from fsspec.implementations.http import HTTPFileSystem from fsspec.implementations.local import LocalFileSystem from gcsfs import GCSFileSystem -from kedro.io import DataSetError -from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.io import DataSetError +from kedro_datasets.io.core import PROTOCOL_DELIMITER, Version from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem diff --git a/kedro-datasets/tests/pandas/test_json_dataset.py b/kedro-datasets/tests/pandas/test_json_dataset.py index df2e856d5..797a42770 100644 --- a/kedro-datasets/tests/pandas/test_json_dataset.py +++ b/kedro-datasets/tests/pandas/test_json_dataset.py @@ -6,8 +6,8 @@ from fsspec.implementations.http import HTTPFileSystem from fsspec.implementations.local import LocalFileSystem from gcsfs import GCSFileSystem -from kedro.io import DataSetError -from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.io import DataSetError +from kedro_datasets.io.core import PROTOCOL_DELIMITER, Version from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem diff --git 
a/kedro-datasets/tests/pandas/test_parquet_dataset.py b/kedro-datasets/tests/pandas/test_parquet_dataset.py index 2d7ce2996..a4963b36a 100644 --- a/kedro-datasets/tests/pandas/test_parquet_dataset.py +++ b/kedro-datasets/tests/pandas/test_parquet_dataset.py @@ -5,8 +5,8 @@ from fsspec.implementations.http import HTTPFileSystem from fsspec.implementations.local import LocalFileSystem from gcsfs import GCSFileSystem -from kedro.io import DataSetError -from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.io import DataSetError +from kedro_datasets.io.core import PROTOCOL_DELIMITER, Version from pandas.testing import assert_frame_equal from pyarrow.fs import FSSpecHandler, PyFileSystem from s3fs.core import S3FileSystem diff --git a/kedro-datasets/tests/pandas/test_sql_dataset.py b/kedro-datasets/tests/pandas/test_sql_dataset.py index aa9fe8d17..da8982588 100644 --- a/kedro-datasets/tests/pandas/test_sql_dataset.py +++ b/kedro-datasets/tests/pandas/test_sql_dataset.py @@ -5,7 +5,7 @@ import pandas as pd import pytest import sqlalchemy -from kedro.io import DataSetError +from kedro_datasets.io import DataSetError from kedro_datasets.pandas import SQLQueryDataSet, SQLTableDataSet diff --git a/kedro-datasets/tests/pandas/test_xml_dataset.py b/kedro-datasets/tests/pandas/test_xml_dataset.py index bd62ea586..645d2eb6c 100644 --- a/kedro-datasets/tests/pandas/test_xml_dataset.py +++ b/kedro-datasets/tests/pandas/test_xml_dataset.py @@ -6,8 +6,8 @@ from fsspec.implementations.http import HTTPFileSystem from fsspec.implementations.local import LocalFileSystem from gcsfs import GCSFileSystem -from kedro.io import DataSetError -from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.io import DataSetError +from kedro_datasets.io.core import PROTOCOL_DELIMITER, Version from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem diff --git a/kedro-datasets/tests/pickle/test_pickle_dataset.py 
b/kedro-datasets/tests/pickle/test_pickle_dataset.py index fb95681a3..c444bdf49 100644 --- a/kedro-datasets/tests/pickle/test_pickle_dataset.py +++ b/kedro-datasets/tests/pickle/test_pickle_dataset.py @@ -6,8 +6,8 @@ from fsspec.implementations.http import HTTPFileSystem from fsspec.implementations.local import LocalFileSystem from gcsfs import GCSFileSystem -from kedro.io import DataSetError -from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.io import DataSetError +from kedro_datasets.io.core import PROTOCOL_DELIMITER, Version from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem diff --git a/kedro-datasets/tests/pillow/test_image_dataset.py b/kedro-datasets/tests/pillow/test_image_dataset.py index ea500b20d..2575065cd 100644 --- a/kedro-datasets/tests/pillow/test_image_dataset.py +++ b/kedro-datasets/tests/pillow/test_image_dataset.py @@ -4,8 +4,8 @@ import pytest from fsspec.implementations.http import HTTPFileSystem from fsspec.implementations.local import LocalFileSystem -from kedro.io import DataSetError -from kedro.io.core import PROTOCOL_DELIMITER, Version, generate_timestamp +from kedro_datasets.io import DataSetError +from kedro_datasets.io.core import PROTOCOL_DELIMITER, Version, generate_timestamp from PIL import Image, ImageChops from s3fs.core import S3FileSystem diff --git a/kedro-datasets/tests/plotly/test_json_dataset.py b/kedro-datasets/tests/plotly/test_json_dataset.py index ab6e17d9c..328fa5d88 100644 --- a/kedro-datasets/tests/plotly/test_json_dataset.py +++ b/kedro-datasets/tests/plotly/test_json_dataset.py @@ -6,8 +6,8 @@ from fsspec.implementations.http import HTTPFileSystem from fsspec.implementations.local import LocalFileSystem from gcsfs import GCSFileSystem -from kedro.io import DataSetError -from kedro.io.core import PROTOCOL_DELIMITER +from kedro_datasets.io import DataSetError +from kedro_datasets.io.core import PROTOCOL_DELIMITER from s3fs.core import S3FileSystem from 
kedro_datasets.plotly import JSONDataSet diff --git a/kedro-datasets/tests/plotly/test_plotly_dataset.py b/kedro-datasets/tests/plotly/test_plotly_dataset.py index a422060e8..254a1dc7f 100644 --- a/kedro-datasets/tests/plotly/test_plotly_dataset.py +++ b/kedro-datasets/tests/plotly/test_plotly_dataset.py @@ -6,8 +6,8 @@ from fsspec.implementations.http import HTTPFileSystem from fsspec.implementations.local import LocalFileSystem from gcsfs import GCSFileSystem -from kedro.io import DataSetError -from kedro.io.core import PROTOCOL_DELIMITER +from kedro_datasets.io import DataSetError +from kedro_datasets.io.core import PROTOCOL_DELIMITER from plotly import graph_objects from plotly.graph_objs import Scatter from s3fs.core import S3FileSystem diff --git a/kedro-datasets/tests/polars/test_csv_dataset.py b/kedro-datasets/tests/polars/test_csv_dataset.py index 8b05a2025..46f23c9af 100644 --- a/kedro-datasets/tests/polars/test_csv_dataset.py +++ b/kedro-datasets/tests/polars/test_csv_dataset.py @@ -10,8 +10,8 @@ from fsspec.implementations.http import HTTPFileSystem from fsspec.implementations.local import LocalFileSystem from gcsfs import GCSFileSystem -from kedro.io import DataSetError -from kedro.io.core import PROTOCOL_DELIMITER, Version, generate_timestamp +from kedro_datasets.io import DataSetError +from kedro_datasets.io.core import PROTOCOL_DELIMITER, Version, generate_timestamp from moto import mock_s3 from polars.testing import assert_frame_equal from s3fs.core import S3FileSystem diff --git a/kedro-datasets/tests/redis/test_redis_dataset.py b/kedro-datasets/tests/redis/test_redis_dataset.py index eaa8abbd2..ecc30af18 100644 --- a/kedro-datasets/tests/redis/test_redis_dataset.py +++ b/kedro-datasets/tests/redis/test_redis_dataset.py @@ -7,7 +7,7 @@ import pandas as pd import pytest import redis -from kedro.io import DataSetError +from kedro_datasets.io import DataSetError from pandas.testing import assert_frame_equal from kedro_datasets.redis import 
PickleDataSet diff --git a/kedro-datasets/tests/snowflake/test_snowpark_dataset.py b/kedro-datasets/tests/snowflake/test_snowpark_dataset.py index 2133953b5..929e849e1 100644 --- a/kedro-datasets/tests/snowflake/test_snowpark_dataset.py +++ b/kedro-datasets/tests/snowflake/test_snowpark_dataset.py @@ -2,7 +2,7 @@ import os import pytest -from kedro.io import DataSetError +from kedro_datasets.io import DataSetError try: import snowflake.snowpark as sp diff --git a/kedro-datasets/tests/spark/test_deltatable_dataset.py b/kedro-datasets/tests/spark/test_deltatable_dataset.py index 5cbbe62b7..651c2c876 100644 --- a/kedro-datasets/tests/spark/test_deltatable_dataset.py +++ b/kedro-datasets/tests/spark/test_deltatable_dataset.py @@ -1,6 +1,7 @@ import pytest from delta import DeltaTable -from kedro.io import DataCatalog, DataSetError +from kedro.io import DataCatalog +from kedro_datasets.io import DataSetError from kedro.pipeline import node from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline from kedro.runner import ParallelRunner diff --git a/kedro-datasets/tests/spark/test_spark_dataset.py b/kedro-datasets/tests/spark/test_spark_dataset.py index 74c5ee2bf..1c45e37e5 100644 --- a/kedro-datasets/tests/spark/test_spark_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_dataset.py @@ -7,8 +7,9 @@ import boto3 import pandas as pd import pytest -from kedro.io import DataCatalog, DataSetError, Version -from kedro.io.core import generate_timestamp +from kedro.io import DataCatalog +from kedro_datasets.io import DataSetError, Version +from kedro_datasets.io.core import generate_timestamp from kedro.pipeline import node from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline from kedro.runner import ParallelRunner, SequentialRunner diff --git a/kedro-datasets/tests/spark/test_spark_hive_dataset.py b/kedro-datasets/tests/spark/test_spark_hive_dataset.py index e0b8fc333..47cbc41a1 100644 --- 
a/kedro-datasets/tests/spark/test_spark_hive_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_hive_dataset.py @@ -4,7 +4,7 @@ from tempfile import TemporaryDirectory import pytest -from kedro.io import DataSetError +from kedro_datasets.io import DataSetError from psutil import Popen from pyspark import SparkContext from pyspark.sql import SparkSession diff --git a/kedro-datasets/tests/spark/test_spark_jdbc_dataset.py b/kedro-datasets/tests/spark/test_spark_jdbc_dataset.py index 0f3d0e66b..f8e0949e0 100644 --- a/kedro-datasets/tests/spark/test_spark_jdbc_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_jdbc_dataset.py @@ -1,7 +1,7 @@ from unittest import mock import pytest -from kedro.io import DataSetError +from kedro_datasets.io import DataSetError from kedro_datasets.spark import SparkJDBCDataSet diff --git a/kedro-datasets/tests/tensorflow/test_tensorflow_model_dataset.py b/kedro-datasets/tests/tensorflow/test_tensorflow_model_dataset.py index 26d421853..12928efb2 100644 --- a/kedro-datasets/tests/tensorflow/test_tensorflow_model_dataset.py +++ b/kedro-datasets/tests/tensorflow/test_tensorflow_model_dataset.py @@ -6,8 +6,8 @@ from fsspec.implementations.http import HTTPFileSystem from fsspec.implementations.local import LocalFileSystem from gcsfs import GCSFileSystem -from kedro.io import DataSetError -from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.io import DataSetError +from kedro_datasets.io.core import PROTOCOL_DELIMITER, Version from s3fs import S3FileSystem diff --git a/kedro-datasets/tests/text/test_text_dataset.py b/kedro-datasets/tests/text/test_text_dataset.py index 733cc6c1f..8b09b17ad 100644 --- a/kedro-datasets/tests/text/test_text_dataset.py +++ b/kedro-datasets/tests/text/test_text_dataset.py @@ -4,8 +4,8 @@ from fsspec.implementations.http import HTTPFileSystem from fsspec.implementations.local import LocalFileSystem from gcsfs import GCSFileSystem -from kedro.io import DataSetError -from kedro.io.core 
import PROTOCOL_DELIMITER, Version +from kedro_datasets.io import DataSetError +from kedro_datasets.io.core import PROTOCOL_DELIMITER, Version from s3fs.core import S3FileSystem from kedro_datasets.text import TextDataSet diff --git a/kedro-datasets/tests/tracking/test_json_dataset.py b/kedro-datasets/tests/tracking/test_json_dataset.py index 62172b1a4..7c1a87924 100644 --- a/kedro-datasets/tests/tracking/test_json_dataset.py +++ b/kedro-datasets/tests/tracking/test_json_dataset.py @@ -4,8 +4,8 @@ import pytest from fsspec.implementations.local import LocalFileSystem from gcsfs import GCSFileSystem -from kedro.io import DataSetError -from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.io import DataSetError +from kedro_datasets.io.core import PROTOCOL_DELIMITER, Version from s3fs.core import S3FileSystem from kedro_datasets.tracking import JSONDataSet diff --git a/kedro-datasets/tests/tracking/test_metrics_dataset.py b/kedro-datasets/tests/tracking/test_metrics_dataset.py index 2c1157de9..8e9c6796c 100644 --- a/kedro-datasets/tests/tracking/test_metrics_dataset.py +++ b/kedro-datasets/tests/tracking/test_metrics_dataset.py @@ -4,8 +4,8 @@ import pytest from fsspec.implementations.local import LocalFileSystem from gcsfs import GCSFileSystem -from kedro.io import DataSetError -from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.io import DataSetError +from kedro_datasets.io.core import PROTOCOL_DELIMITER, Version from s3fs.core import S3FileSystem from kedro_datasets.tracking import MetricsDataSet diff --git a/kedro-datasets/tests/video/test_video_dataset.py b/kedro-datasets/tests/video/test_video_dataset.py index 1ac3d1ce4..c20c5aa65 100644 --- a/kedro-datasets/tests/video/test_video_dataset.py +++ b/kedro-datasets/tests/video/test_video_dataset.py @@ -1,6 +1,6 @@ import boto3 import pytest -from kedro.io import DataSetError +from kedro_datasets.io import DataSetError from moto import mock_s3 from utils import TEST_FPS, 
assert_videos_equal diff --git a/kedro-datasets/tests/yaml/test_yaml_dataset.py b/kedro-datasets/tests/yaml/test_yaml_dataset.py index 653606c17..00a72da1d 100644 --- a/kedro-datasets/tests/yaml/test_yaml_dataset.py +++ b/kedro-datasets/tests/yaml/test_yaml_dataset.py @@ -5,8 +5,8 @@ from fsspec.implementations.http import HTTPFileSystem from fsspec.implementations.local import LocalFileSystem from gcsfs import GCSFileSystem -from kedro.io import DataSetError -from kedro.io.core import PROTOCOL_DELIMITER, Version +from kedro_datasets.io import DataSetError +from kedro_datasets.io.core import PROTOCOL_DELIMITER, Version from pandas.testing import assert_frame_equal from s3fs.core import S3FileSystem From 389d34e634489c9c237b9b03a387840a71c76b7b Mon Sep 17 00:00:00 2001 From: brandonmeek Date: Thu, 9 Mar 2023 17:07:06 -0600 Subject: [PATCH 43/44] Removed `kedro~=0.18.4` as requirement, added `cachetools~=5.3` --- kedro-datasets/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-datasets/requirements.txt b/kedro-datasets/requirements.txt index b5edbb617..5e7b5e454 100644 --- a/kedro-datasets/requirements.txt +++ b/kedro-datasets/requirements.txt @@ -1 +1 @@ -kedro~=0.18.4 +cachetools~=5.3 \ No newline at end of file From 6cc3b099081148b7db82bb52bd0dd6e33df6170d Mon Sep 17 00:00:00 2001 From: brandonmeek Date: Thu, 9 Mar 2023 17:18:45 -0600 Subject: [PATCH 44/44] Updated RELEASE.md to reflect changes --- kedro-datasets/RELEASE.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 3e108e7f4..640244c8e 100644 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -11,6 +11,10 @@ | `polars.CSVDataSet` | A `CSVDataSet` backed by [polars](https://www.pola.rs/), a lighting fast dataframe package built entirely using Rust. 
| `kedro_datasets.polars` | | `snowflake.SnowparkTableDataSet` | Work with [Snowpark](https://www.snowflake.com/en/data-cloud/snowpark/) DataFrames from tables in Snowflake. | `kedro_datasets.snowflake` | +* Vendored `kedro.io.core` and `kedro.utils` into `kedro_datasets.io.core` and `kedro_datasets.io.utils` respectively, +allowing implementations of `AbstractDataSet` and `AbstractVersionedDataSet` to be shared with +and used by non-Kedro users. + ## Bug fixes and other changes * Add `mssql` backend to the `SQLQueryDataSet` DataSet using `pyodbc` library.