Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add metadata attribute to datasets #189

Merged
merged 22 commits into from
May 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion kedro-datasets/RELEASE.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
* Added pandas 2.0 support.
* Added SQLAlchemy 2.0 support (and dropped support for versions below 1.4).
* Added a `save` method to `APIDataSet`.

* Reduced constructor arguments for `APIDataSet` by replacing most arguments with a single constructor argument `load_args`. This makes it more consistent with other Kedro DataSets and the underlying `requests` API, and automatically enables the full configuration domain: stream, certificates, proxies, and more.
* Relaxed Kedro version pin to `>=0.16`
* Added `metadata` attribute to all existing datasets. This is ignored by Kedro, but may be consumed by users or external plugins.

## Bug fixes and other changes
* Relaxed `delta-spark` upper bound to allow compatibility with Spark 3.1.x and 3.2.x.
Expand Down
18 changes: 10 additions & 8 deletions kedro-datasets/kedro_datasets/api/api_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,6 @@
from requests import Session, sessions
from requests.auth import AuthBase

# NOTE: kedro.extras.datasets will be removed in Kedro 0.19.0.
# Any contribution to datasets should be made in kedro-datasets
# in kedro-plugins (https://github.com/kedro-org/kedro-plugins)


class APIDataSet(AbstractDataSet[None, requests.Response]):
"""``APIDataSet`` loads/saves data from/to HTTP(S) APIs.
Expand All @@ -38,7 +34,7 @@ class APIDataSet(AbstractDataSet[None, requests.Response]):
Example usage for the `Python API <https://kedro.readthedocs.io/en/stable/data/\
data_catalog.html#use-the-data-catalog-with-the-code-api>`_: ::

>>> from kedro.extras.datasets.api import APIDataSet
>>> from kedro_datasets.api import APIDataSet
>>>
>>>
>>> data_set = APIDataSet(
Expand Down Expand Up @@ -99,6 +95,7 @@ def __init__(
load_args: Dict[str, Any] = None,
save_args: Dict[str, Any] = None,
credentials: Union[Tuple[str, str], List[str], AuthBase] = None,
metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``APIDataSet`` to fetch data from an API endpoint.

Expand All @@ -108,12 +105,15 @@ def __init__(
methods
load_args: Additional parameters to be fed to requests.request.
https://requests.readthedocs.io/en/latest/api/#requests.request
credentials: Allows specifying secrets in credentials.yml.
Expected format is ``('login', 'password')`` if given as a tuple or
list. An ``AuthBase`` instance can be provided for more complex cases.
save_args: Options for saving data on server. Includes all parameters used
during load method. Adds an optional parameter, ``chunk_size`` which
determines the size of the package sent at each request.
credentials: Allows specifying secrets in credentials.yml.
Expected format is ``('login', 'password')`` if given as a tuple or list.
An ``AuthBase`` instance can be provided for more complex cases.
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.

Raises:
ValueError: if both ``auth`` in ``load_args`` and ``credentials`` are
specified.
Expand Down Expand Up @@ -153,6 +153,8 @@ def __init__(
**self._params,
}

self.metadata = metadata
merelcht marked this conversation as resolved.
Show resolved Hide resolved

@staticmethod
def _convert_type(value: Any):
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@
from kedro.io.core import AbstractDataSet, get_filepath_str, get_protocol_and_path


class BioSequenceDataSet(AbstractDataSet[List, List]):
class BioSequenceDataSet(
AbstractDataSet[List, List]
): # pylint:disable=too-many-instance-attributes
r"""``BioSequenceDataSet`` loads and saves data to a sequence file.

Example:
Expand Down Expand Up @@ -47,6 +49,7 @@ def __init__(
save_args: Dict[str, Any] = None,
credentials: Dict[str, Any] = None,
fs_args: Dict[str, Any] = None,
metadata: Dict[str, Any] = None,
) -> None:
"""
Creates a new instance of ``BioSequenceDataSet`` pointing
Expand All @@ -69,6 +72,8 @@ def __init__(
https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open
All defaults are preserved, except `mode`, which is set to `r` when loading
and to `w` when saving.
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.

Note: Here you can find all supported file formats: https://biopython.org/wiki/SeqIO
"""
Expand Down Expand Up @@ -100,6 +105,8 @@ def __init__(
self._fs_open_args_load = _fs_open_args_load
self._fs_open_args_save = _fs_open_args_save

self.metadata = metadata

def _describe(self) -> Dict[str, Any]:
return {
"filepath": self._filepath,
Expand Down
5 changes: 5 additions & 0 deletions kedro-datasets/kedro_datasets/dask/parquet_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ def __init__(
save_args: Dict[str, Any] = None,
credentials: Dict[str, Any] = None,
fs_args: Dict[str, Any] = None,
metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``ParquetDataSet`` pointing to concrete
parquet files.
Expand All @@ -109,11 +110,15 @@ def __init__(
E.g. for ``GCSFileSystem`` it should look like `{"token": None}`.
fs_args: Optional parameters to the backend file system driver:
https://docs.dask.org/en/latest/how-to/connect-to-remote-data.html#optional-parameters
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.
"""
self._filepath = filepath
self._fs_args = deepcopy(fs_args) or {}
self._credentials = deepcopy(credentials) or {}

self.metadata = metadata

# Handle default load and save arguments
self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS)
if load_args is not None:
Expand Down
5 changes: 5 additions & 0 deletions kedro-datasets/kedro_datasets/email/message_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def __init__(
version: Version = None,
credentials: Dict[str, Any] = None,
fs_args: Dict[str, Any] = None,
metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``EmailMessageDataSet`` pointing to a concrete text file
on a specific filesystem.
Expand Down Expand Up @@ -103,6 +104,8 @@ def __init__(
https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open
All defaults are preserved, except `mode`, which is set to `r` when loading
and to `w` when saving.
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.
"""
_fs_args = deepcopy(fs_args) or {}
_fs_open_args_load = _fs_args.pop("open_args_load", {})
Expand All @@ -116,6 +119,8 @@ def __init__(
_fs_args.setdefault("auto_mkdir", True)
self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args)

self.metadata = metadata

super().__init__(
filepath=PurePosixPath(path),
version=version,
Expand Down
5 changes: 5 additions & 0 deletions kedro-datasets/kedro_datasets/geopandas/geojson_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def __init__(
version: Version = None,
credentials: Dict[str, Any] = None,
fs_args: Dict[str, Any] = None,
metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``GeoJSONDataSet`` pointing to a concrete GeoJSON file
on a specific filesystem fsspec.
Expand Down Expand Up @@ -85,6 +86,8 @@ def __init__(
Here you can find all available arguments for `open`:
https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open
All defaults are preserved, except `mode`, which is set to `wb` when saving.
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.
"""
_fs_args = copy.deepcopy(fs_args) or {}
_fs_open_args_load = _fs_args.pop("open_args_load", {})
Expand All @@ -97,6 +100,8 @@ def __init__(

self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args)

self.metadata = metadata

super().__init__(
filepath=PurePosixPath(path),
version=version,
Expand Down
5 changes: 5 additions & 0 deletions kedro-datasets/kedro_datasets/holoviews/holoviews_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def __init__(
credentials: Dict[str, Any] = None,
save_args: Dict[str, Any] = None,
version: Version = None,
metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``HoloviewsWriter``.

Expand All @@ -70,6 +71,8 @@ def __init__(
``kedro.io.core.Version``. If its ``load`` attribute is
None, the latest version will be loaded. If its ``save``
attribute is None, save version will be autogenerated.
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.
"""
_credentials = deepcopy(credentials) or {}
_fs_args = deepcopy(fs_args) or {}
Expand All @@ -83,6 +86,8 @@ def __init__(
self._protocol = protocol
self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args)

self.metadata = metadata

super().__init__(
filepath=PurePosixPath(path),
version=version,
Expand Down
5 changes: 5 additions & 0 deletions kedro-datasets/kedro_datasets/json/json_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ def __init__(
version: Version = None,
credentials: Dict[str, Any] = None,
fs_args: Dict[str, Any] = None,
metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``JSONDataSet`` pointing to a concrete JSON file
on a specific filesystem.
Expand Down Expand Up @@ -86,6 +87,8 @@ def __init__(
https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open
All defaults are preserved, except `mode`, which is set to `r` when loading
and to `w` when saving.
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.
"""
_fs_args = deepcopy(fs_args) or {}
_fs_open_args_load = _fs_args.pop("open_args_load", {})
Expand All @@ -99,6 +102,8 @@ def __init__(
_fs_args.setdefault("auto_mkdir", True)
self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args)

self.metadata = metadata

super().__init__(
filepath=PurePosixPath(path),
version=version,
Expand Down
5 changes: 5 additions & 0 deletions kedro-datasets/kedro_datasets/matplotlib/matplotlib_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ def __init__(
save_args: Dict[str, Any] = None,
version: Version = None,
overwrite: bool = False,
metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``MatplotlibWriter``.

Expand All @@ -140,6 +141,8 @@ def __init__(
overwrite: If True, any existing image files will be removed.
Only relevant when saving multiple Matplotlib objects at
once.
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.
"""
_credentials = deepcopy(credentials) or {}
_fs_args = deepcopy(fs_args) or {}
Expand All @@ -153,6 +156,8 @@ def __init__(
self._protocol = protocol
self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args)

self.metadata = metadata

super().__init__(
filepath=PurePosixPath(path),
version=version,
Expand Down
5 changes: 5 additions & 0 deletions kedro-datasets/kedro_datasets/networkx/gml_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def __init__(
version: Version = None,
credentials: Dict[str, Any] = None,
fs_args: Dict[str, Any] = None,
metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``GMLDataSet``.

Expand All @@ -73,6 +74,8 @@ def __init__(
https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open
All defaults are preserved, except `mode`, which is set to `r` when loading
and to `w` when saving.
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.
"""
_fs_args = deepcopy(fs_args) or {}
_fs_open_args_load = _fs_args.pop("open_args_load", {})
Expand All @@ -86,6 +89,8 @@ def __init__(
self._protocol = protocol
self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args)

self.metadata = metadata

super().__init__(
filepath=PurePosixPath(path),
version=version,
Expand Down
5 changes: 5 additions & 0 deletions kedro-datasets/kedro_datasets/networkx/graphml_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def __init__(
version: Version = None,
credentials: Dict[str, Any] = None,
fs_args: Dict[str, Any] = None,
metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``GraphMLDataSet``.

Expand All @@ -72,6 +73,8 @@ def __init__(
https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open
All defaults are preserved, except `mode`, which is set to `r` when loading
and to `w` when saving.
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.
"""
_fs_args = deepcopy(fs_args) or {}
_fs_open_args_load = _fs_args.pop("open_args_load", {})
Expand All @@ -85,6 +88,8 @@ def __init__(
self._protocol = protocol
self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args)

self.metadata = metadata

super().__init__(
filepath=PurePosixPath(path),
version=version,
Expand Down
5 changes: 5 additions & 0 deletions kedro-datasets/kedro_datasets/networkx/json_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def __init__(
version: Version = None,
credentials: Dict[str, Any] = None,
fs_args: Dict[str, Any] = None,
metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``JSONDataSet``.

Expand All @@ -73,6 +74,8 @@ def __init__(
https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open
All defaults are preserved, except `mode`, which is set to `r` when loading
and to `w` when saving.
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.
"""
_fs_args = deepcopy(fs_args) or {}
_fs_open_args_load = _fs_args.pop("open_args_load", {})
Expand All @@ -86,6 +89,8 @@ def __init__(
self._protocol = protocol
self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args)

self.metadata = metadata

super().__init__(
filepath=PurePosixPath(path),
version=version,
Expand Down
5 changes: 5 additions & 0 deletions kedro-datasets/kedro_datasets/pandas/csv_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def __init__(
version: Version = None,
credentials: Dict[str, Any] = None,
fs_args: Dict[str, Any] = None,
metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``CSVDataSet`` pointing to a concrete CSV file
on a specific filesystem.
Expand All @@ -102,6 +103,8 @@ def __init__(
E.g. for ``GCSFileSystem`` it should look like `{"token": None}`.
fs_args: Extra arguments to pass into underlying filesystem class constructor
(e.g. `{"project": "my-project"}` for ``GCSFileSystem``).
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.
"""
_fs_args = deepcopy(fs_args) or {}
_credentials = deepcopy(credentials) or {}
Expand All @@ -114,6 +117,8 @@ def __init__(
self._storage_options = {**_credentials, **_fs_args}
self._fs = fsspec.filesystem(self._protocol, **self._storage_options)

self.metadata = metadata

super().__init__(
filepath=PurePosixPath(path),
version=version,
Expand Down
5 changes: 5 additions & 0 deletions kedro-datasets/kedro_datasets/pandas/excel_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ def __init__(
version: Version = None,
credentials: Dict[str, Any] = None,
fs_args: Dict[str, Any] = None,
metadata: Dict[str, Any] = None,
) -> None:
"""Creates a new instance of ``ExcelDataSet`` pointing to a concrete Excel file
on a specific filesystem.
Expand Down Expand Up @@ -150,6 +151,8 @@ def __init__(
E.g. for ``GCSFileSystem`` it should look like `{"token": None}`.
fs_args: Extra arguments to pass into underlying filesystem class constructor
(e.g. `{"project": "my-project"}` for ``GCSFileSystem``).
metadata: Any arbitrary metadata.
This is ignored by Kedro, but may be consumed by users or external plugins.

Raises:
DataSetError: If versioning is enabled while in append mode.
Expand All @@ -165,6 +168,8 @@ def __init__(
self._storage_options = {**_credentials, **_fs_args}
self._fs = fsspec.filesystem(self._protocol, **self._storage_options)

self.metadata = metadata

super().__init__(
filepath=PurePosixPath(path),
version=version,
Expand Down
Loading