Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add metadata attribute to datasets #189

Merged
merged 22 commits into from
May 22, 2023
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions kedro-datasets/RELEASE.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
## Major features and improvements:
* Added pandas 2.0 support.
* Added SQLAlchemy 2.0 support (and dropped support for versions below 1.4).
* Added `metadata` attribute to all existing datasets.
AhdraMeraliQB marked this conversation as resolved.
Show resolved Hide resolved
* Reduced constructor arguments for `APIDataSet` by replacing most arguments with a single constructor argument `load_args`. This makes it more consistent with other Kedro DataSets and the underlying `requests` API, and automatically enables the full configuration domain: stream, certificates, proxies, and more.

## Bug fixes and other changes
* Relaxed `delta-spark` upper bound to allow compatibility with Spark 3.1.x and 3.2.x.

Expand Down
13 changes: 8 additions & 5 deletions kedro-datasets/kedro_datasets/api/api_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,6 @@
from requests import Session, sessions
from requests.auth import AuthBase

# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0.
# Any contribution to datasets should be made in kedro-datasets
# in kedro-plugins (https://github.com/kedro-org/kedro-plugins)


class APIDataSet(AbstractDataSet[None, requests.Response]):
"""``APIDataSet`` loads the data from HTTP(S) APIs.
Expand Down Expand Up @@ -39,7 +35,7 @@ class APIDataSet(AbstractDataSet[None, requests.Response]):
data_catalog.html#use-the-data-catalog-with-the-code-api>`_:
::

>>> from kedro.extras.datasets.api import APIDataSet
>>> from kedro_datasets.api import APIDataSet
>>>
>>>
>>> data_set = APIDataSet(
Expand All @@ -59,12 +55,14 @@ class APIDataSet(AbstractDataSet[None, requests.Response]):
>>> data = data_set.load()
"""

# pylint: disable=too-many-arguments
def __init__(
self,
url: str,
method: str = "GET",
load_args: Dict[str, Any] = None,
credentials: Union[Tuple[str, str], List[str], AuthBase] = None,
metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``APIDataSet`` to fetch data from an API endpoint.

Expand All @@ -76,6 +74,9 @@ def __init__(
credentials: Allows specifying secrets in credentials.yml.
Expected format is ``('login', 'password')`` if given as a tuple or list.
An ``AuthBase`` instance can be provided for more complex cases.
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.

Raises:
ValueError: if both ``auth`` in ``load_args`` and ``credentials`` are specified.
"""
Expand All @@ -102,6 +103,8 @@ def __init__(
**self._load_args,
}

self._metadata = metadata

@staticmethod
def _convert_type(value: Any):
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@
from kedro.io.core import AbstractDataSet, get_filepath_str, get_protocol_and_path


class BioSequenceDataSet(AbstractDataSet[List, List]):
class BioSequenceDataSet(
AbstractDataSet[List, List]
): # pylint:disable=too-many-instance-attributes
r"""``BioSequenceDataSet`` loads and saves data to a sequence file.

Example:
Expand Down Expand Up @@ -47,6 +49,7 @@ def __init__(
save_args: Dict[str, Any] = None,
credentials: Dict[str, Any] = None,
fs_args: Dict[str, Any] = None,
metadata: Dict[str, Any] = None,
) -> None:
"""
Creates a new instance of ``BioSequenceDataSet`` pointing
Expand All @@ -69,6 +72,8 @@ def __init__(
https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open
All defaults are preserved, except `mode`, which is set to `r` when loading
and to `w` when saving.
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.

Note: Here you can find all supported file formats: https://biopython.org/wiki/SeqIO
"""
Expand Down Expand Up @@ -100,6 +105,8 @@ def __init__(
self._fs_open_args_load = _fs_open_args_load
self._fs_open_args_save = _fs_open_args_save

self._metadata = metadata

def _describe(self) -> Dict[str, Any]:
return {
"filepath": self._filepath,
Expand Down
5 changes: 5 additions & 0 deletions kedro-datasets/kedro_datasets/dask/parquet_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ def __init__(
save_args: Dict[str, Any] = None,
credentials: Dict[str, Any] = None,
fs_args: Dict[str, Any] = None,
metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``ParquetDataSet`` pointing to concrete
parquet files.
Expand All @@ -109,11 +110,15 @@ def __init__(
E.g. for ``GCSFileSystem`` it should look like `{"token": None}`.
fs_args: Optional parameters to the backend file system driver:
https://docs.dask.org/en/latest/how-to/connect-to-remote-data.html#optional-parameters
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.
"""
self._filepath = filepath
self._fs_args = deepcopy(fs_args) or {}
self._credentials = deepcopy(credentials) or {}

self._metadata = metadata

# Handle default load and save arguments
self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS)
if load_args is not None:
Expand Down
5 changes: 5 additions & 0 deletions kedro-datasets/kedro_datasets/email/message_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def __init__(
version: Version = None,
credentials: Dict[str, Any] = None,
fs_args: Dict[str, Any] = None,
metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``EmailMessageDataSet`` pointing to a concrete text file
on a specific filesystem.
Expand Down Expand Up @@ -103,6 +104,8 @@ def __init__(
https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open
All defaults are preserved, except `mode`, which is set to `r` when loading
and to `w` when saving.
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.
"""
_fs_args = deepcopy(fs_args) or {}
_fs_open_args_load = _fs_args.pop("open_args_load", {})
Expand All @@ -116,6 +119,8 @@ def __init__(
_fs_args.setdefault("auto_mkdir", True)
self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args)

self._metadata = metadata

super().__init__(
filepath=PurePosixPath(path),
version=version,
Expand Down
5 changes: 5 additions & 0 deletions kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def __init__(
version: Version = None,
credentials: Dict[str, Any] = None,
fs_args: Dict[str, Any] = None,
metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``GeoJSONDataSet`` pointing to a concrete GeoJSON file
on a specific filesystem fsspec.
Expand Down Expand Up @@ -85,6 +86,8 @@ def __init__(
Here you can find all available arguments for `open`:
https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open
All defaults are preserved, except `mode`, which is set to `wb` when saving.
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.
"""
_fs_args = copy.deepcopy(fs_args) or {}
_fs_open_args_load = _fs_args.pop("open_args_load", {})
Expand All @@ -97,6 +100,8 @@ def __init__(

self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args)

self._metadata = metadata

super().__init__(
filepath=PurePosixPath(path),
version=version,
Expand Down
5 changes: 5 additions & 0 deletions kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def __init__(
credentials: Dict[str, Any] = None,
save_args: Dict[str, Any] = None,
version: Version = None,
metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``HoloviewsWriter``.

Expand All @@ -70,6 +71,8 @@ def __init__(
``kedro.io.core.Version``. If its ``load`` attribute is
None, the latest version will be loaded. If its ``save``
attribute is None, save version will be autogenerated.
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.
"""
_credentials = deepcopy(credentials) or {}
_fs_args = deepcopy(fs_args) or {}
Expand All @@ -83,6 +86,8 @@ def __init__(
self._protocol = protocol
self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args)

self._metadata = metadata

super().__init__(
filepath=PurePosixPath(path),
version=version,
Expand Down
5 changes: 5 additions & 0 deletions kedro-datasets/kedro_datasets/json/json_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ def __init__(
version: Version = None,
credentials: Dict[str, Any] = None,
fs_args: Dict[str, Any] = None,
metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``JSONDataSet`` pointing to a concrete JSON file
on a specific filesystem.
Expand Down Expand Up @@ -86,6 +87,8 @@ def __init__(
https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open
All defaults are preserved, except `mode`, which is set to `r` when loading
and to `w` when saving.
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.
"""
_fs_args = deepcopy(fs_args) or {}
_fs_open_args_load = _fs_args.pop("open_args_load", {})
Expand All @@ -99,6 +102,8 @@ def __init__(
_fs_args.setdefault("auto_mkdir", True)
self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args)

self._metadata = metadata

super().__init__(
filepath=PurePosixPath(path),
version=version,
Expand Down
5 changes: 5 additions & 0 deletions kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ def __init__(
save_args: Dict[str, Any] = None,
version: Version = None,
overwrite: bool = False,
metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``MatplotlibWriter``.

Expand All @@ -140,6 +141,8 @@ def __init__(
overwrite: If True, any existing image files will be removed.
Only relevant when saving multiple Matplotlib objects at
once.
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.
"""
_credentials = deepcopy(credentials) or {}
_fs_args = deepcopy(fs_args) or {}
Expand All @@ -153,6 +156,8 @@ def __init__(
self._protocol = protocol
self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args)

self._metadata = metadata

super().__init__(
filepath=PurePosixPath(path),
version=version,
Expand Down
5 changes: 5 additions & 0 deletions kedro-datasets/kedro_datasets/networkx/gml_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def __init__(
version: Version = None,
credentials: Dict[str, Any] = None,
fs_args: Dict[str, Any] = None,
metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``GMLDataSet``.

Expand All @@ -73,6 +74,8 @@ def __init__(
https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open
All defaults are preserved, except `mode`, which is set to `r` when loading
and to `w` when saving.
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.
"""
_fs_args = deepcopy(fs_args) or {}
_fs_open_args_load = _fs_args.pop("open_args_load", {})
Expand All @@ -86,6 +89,8 @@ def __init__(
self._protocol = protocol
self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args)

self._metadata = metadata

super().__init__(
filepath=PurePosixPath(path),
version=version,
Expand Down
5 changes: 5 additions & 0 deletions kedro-datasets/kedro_datasets/networkx/graphml_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def __init__(
version: Version = None,
credentials: Dict[str, Any] = None,
fs_args: Dict[str, Any] = None,
metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``GraphMLDataSet``.

Expand All @@ -72,6 +73,8 @@ def __init__(
https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open
All defaults are preserved, except `mode`, which is set to `r` when loading
and to `w` when saving.
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.
"""
_fs_args = deepcopy(fs_args) or {}
_fs_open_args_load = _fs_args.pop("open_args_load", {})
Expand All @@ -85,6 +88,8 @@ def __init__(
self._protocol = protocol
self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args)

self._metadata = metadata

super().__init__(
filepath=PurePosixPath(path),
version=version,
Expand Down
5 changes: 5 additions & 0 deletions kedro-datasets/kedro_datasets/networkx/json_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def __init__(
version: Version = None,
credentials: Dict[str, Any] = None,
fs_args: Dict[str, Any] = None,
metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``JSONDataSet``.

Expand All @@ -73,6 +74,8 @@ def __init__(
https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open
All defaults are preserved, except `mode`, which is set to `r` when loading
and to `w` when saving.
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.
"""
_fs_args = deepcopy(fs_args) or {}
_fs_open_args_load = _fs_args.pop("open_args_load", {})
Expand All @@ -86,6 +89,8 @@ def __init__(
self._protocol = protocol
self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args)

self._metadata = metadata

super().__init__(
filepath=PurePosixPath(path),
version=version,
Expand Down
5 changes: 5 additions & 0 deletions kedro-datasets/kedro_datasets/pandas/csv_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def __init__(
version: Version = None,
credentials: Dict[str, Any] = None,
fs_args: Dict[str, Any] = None,
metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``CSVDataSet`` pointing to a concrete CSV file
on a specific filesystem.
Expand All @@ -102,6 +103,8 @@ def __init__(
E.g. for ``GCSFileSystem`` it should look like `{"token": None}`.
fs_args: Extra arguments to pass into underlying filesystem class constructor
(e.g. `{"project": "my-project"}` for ``GCSFileSystem``).
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.
"""
_fs_args = deepcopy(fs_args) or {}
_credentials = deepcopy(credentials) or {}
Expand All @@ -114,6 +117,8 @@ def __init__(
self._storage_options = {**_credentials, **_fs_args}
self._fs = fsspec.filesystem(self._protocol, **self._storage_options)

self._metadata = metadata

super().__init__(
filepath=PurePosixPath(path),
version=version,
Expand Down
5 changes: 5 additions & 0 deletions kedro-datasets/kedro_datasets/pandas/excel_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ def __init__(
version: Version = None,
credentials: Dict[str, Any] = None,
fs_args: Dict[str, Any] = None,
metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``ExcelDataSet`` pointing to a concrete Excel file
on a specific filesystem.
Expand Down Expand Up @@ -150,6 +151,8 @@ def __init__(
E.g. for ``GCSFileSystem`` it should look like `{"token": None}`.
fs_args: Extra arguments to pass into underlying filesystem class constructor
(e.g. `{"project": "my-project"}` for ``GCSFileSystem``).
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.

Raises:
DataSetError: If versioning is enabled while in append mode.
Expand All @@ -165,6 +168,8 @@ def __init__(
self._storage_options = {**_credentials, **_fs_args}
self._fs = fsspec.filesystem(self._protocol, **self._storage_options)

self._metadata = metadata

super().__init__(
filepath=PurePosixPath(path),
version=version,
Expand Down
Loading