From f59e93013367c7a3808db3f9296e3cab1378a5c3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Luis=20Cano=20Rodr=C3=ADguez?=
Date: Mon, 13 Nov 2023 14:32:31 +0100
Subject: [PATCH] feat(datasets): Add Hugging Face datasets (#344)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Add HuggingFace datasets

Co-authored-by: Danny Farah
Co-authored-by: Kevin Koga
Co-authored-by: Mate Scharnitzky
Co-authored-by: Tomer Shor
Co-authored-by: Pierre-Yves Mousset
Co-authored-by: Bela Chupal
Co-authored-by: Khangjrakpam Arjun
Co-authored-by: Juan Luis Cano Rodríguez
Signed-off-by: Juan Luis Cano Rodríguez

* Apply suggestions from code review

Signed-off-by: Juan Luis Cano Rodríguez
Co-authored-by: Joel <35801847+datajoely@users.noreply.github.com>
Co-authored-by: Nok Lam Chan

* Typo

Signed-off-by: Juan Luis Cano Rodríguez

* Fix docstring

Signed-off-by: Juan Luis Cano Rodríguez

* Add docstring for HFTransformerPipelineDataset

Signed-off-by: Juan Luis Cano Rodríguez

* Use intersphinx for cross references in Hugging Face docstrings

Signed-off-by: Juan Luis Cano Rodríguez

* Add docstring for HFDataset

Signed-off-by: Juan Luis Cano Rodríguez

* Add missing test dependencies

Signed-off-by: Juan Luis Cano Rodríguez

* Add tests for huggingface datasets

Signed-off-by: Juan Luis Cano Rodríguez

* Fix HFDataset.save

Signed-off-by: Juan Luis Cano Rodríguez

* Add test for HFDataset.list_datasets

Signed-off-by: Juan Luis Cano Rodríguez

* Use new name

Signed-off-by: Juan Luis Cano Rodríguez

* Consolidate imports

Signed-off-by: Juan Luis Cano Rodríguez

---------

Signed-off-by: Juan Luis Cano Rodríguez
Co-authored-by: Danny Farah
Co-authored-by: Kevin Koga
Co-authored-by: Mate Scharnitzky
Co-authored-by: Tomer Shor
Co-authored-by: Pierre-Yves Mousset
Co-authored-by: Bela Chupal
Co-authored-by: Khangjrakpam Arjun
Co-authored-by: Joel <35801847+datajoely@users.noreply.github.com>
Co-authored-by: Nok Lam Chan
---
 kedro-datasets/docs/source/conf.py            |  4 ++
 kedro-datasets/docs/source/kedro_datasets.rst |  2 +
 .../kedro_datasets/huggingface/__init__.py    | 16 +++
 .../huggingface/hugging_face_dataset.py       | 56 +++++++++++++++
 .../transformer_pipeline_dataset.py           | 71 +++++++++++++++++++
 kedro-datasets/setup.py                       |  9 +++
 kedro-datasets/tests/huggingface/__init__.py  |  0
 kedro-datasets/tests/huggingface/conftest.py  |  6 ++
 .../huggingface/test_hugging_face_dataset.py  | 33 +++++++++
 .../test_transformer_pipeline_dataset.py      | 65 +++++++++++++++++
 10 files changed, 262 insertions(+)
 create mode 100644 kedro-datasets/kedro_datasets/huggingface/__init__.py
 create mode 100644 kedro-datasets/kedro_datasets/huggingface/hugging_face_dataset.py
 create mode 100644 kedro-datasets/kedro_datasets/huggingface/transformer_pipeline_dataset.py
 create mode 100644 kedro-datasets/tests/huggingface/__init__.py
 create mode 100644 kedro-datasets/tests/huggingface/conftest.py
 create mode 100644 kedro-datasets/tests/huggingface/test_hugging_face_dataset.py
 create mode 100644 kedro-datasets/tests/huggingface/test_transformer_pipeline_dataset.py

diff --git a/kedro-datasets/docs/source/conf.py b/kedro-datasets/docs/source/conf.py
index 397d23c53..44cbb1b31 100644
--- a/kedro-datasets/docs/source/conf.py
+++ b/kedro-datasets/docs/source/conf.py
@@ -43,6 +43,7 @@
 extensions = [
     "sphinx.ext.autodoc",
     "sphinx.ext.autosummary",
+    "sphinx.ext.intersphinx",
     "sphinx.ext.napoleon",
     "sphinx_autodoc_typehints",
     "sphinx.ext.doctest",
@@ -90,6 +91,9 @@
     "kedro_docs_style_guide.md",
 ]
 
+intersphinx_mapping = {
("https://docs.kedro.org/en/stable/", None), +} type_targets = { "py:class": ( diff --git a/kedro-datasets/docs/source/kedro_datasets.rst b/kedro-datasets/docs/source/kedro_datasets.rst index 3091b3c4a..4eee78b5b 100644 --- a/kedro-datasets/docs/source/kedro_datasets.rst +++ b/kedro-datasets/docs/source/kedro_datasets.rst @@ -24,6 +24,8 @@ kedro_datasets kedro_datasets.geopandas.GeoJSONDataSet kedro_datasets.geopandas.GeoJSONDataset kedro_datasets.holoviews.HoloviewsWriter + kedro_datasets.huggingface.HFDataset + kedro_datasets.huggingface.HFTransformerPipelineDataset kedro_datasets.json.JSONDataSet kedro_datasets.json.JSONDataset kedro_datasets.matplotlib.MatplotlibWriter diff --git a/kedro-datasets/kedro_datasets/huggingface/__init__.py b/kedro-datasets/kedro_datasets/huggingface/__init__.py new file mode 100644 index 000000000..74b6cc927 --- /dev/null +++ b/kedro-datasets/kedro_datasets/huggingface/__init__.py @@ -0,0 +1,16 @@ +"""Provides interface to Hugging Face transformers and datasets.""" +from typing import Any + +import lazy_loader as lazy + +# https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901 +HFDataset: Any +HFTransformerPipelineDataset: Any + +__getattr__, __dir__, __all__ = lazy.attach( + __name__, + submod_attrs={ + "hugging_face_dataset": ["HFDataset"], + "transformer_pipeline_dataset": ["HFTransformerPipelineDataset"], + }, +) diff --git a/kedro-datasets/kedro_datasets/huggingface/hugging_face_dataset.py b/kedro-datasets/kedro_datasets/huggingface/hugging_face_dataset.py new file mode 100644 index 000000000..23f38a2e8 --- /dev/null +++ b/kedro-datasets/kedro_datasets/huggingface/hugging_face_dataset.py @@ -0,0 +1,56 @@ +from __future__ import annotations + +from typing import Any + +from datasets import load_dataset +from huggingface_hub import HfApi +from kedro.io import AbstractVersionedDataset + + +class HFDataset(AbstractVersionedDataset): + """``HFDataset`` loads Hugging Face datasets + using the `datasets `_ library. + + Example usage for the :doc:`YAML API `: + + .. code-block:: yaml + + yelp_reviews: + type: kedro_hf_datasets.HFDataset + dataset_name: yelp_review_full + + Example usage for the :doc:`Python API `: + + .. 
+    .. code-block:: pycon
+
+        >>> from kedro_datasets.huggingface import HFDataset
+        >>> dataset = HFDataset(dataset_name="yelp_review_full")
+        >>> yelp_review_full = dataset.load()
+        >>> assert "train" in yelp_review_full
+        >>> assert "test" in yelp_review_full
+        >>> assert len(yelp_review_full["train"]) == 650000
+
+    """
+
+    def __init__(self, dataset_name: str):
+        self.dataset_name = dataset_name
+
+    def _load(self):
+        return load_dataset(self.dataset_name)
+
+    def _save(self, data):
+        raise NotImplementedError("Not yet implemented")
+
+    def _describe(self) -> dict[str, Any]:
+        api = HfApi()
+        dataset_info = list(api.list_datasets(search=self.dataset_name))[0]
+        return {
+            "dataset_name": self.dataset_name,
+            "dataset_tags": dataset_info.tags,
+            "dataset_author": dataset_info.author,
+        }
+
+    @staticmethod
+    def list_datasets():
+        api = HfApi()
+        return list(api.list_datasets())
diff --git a/kedro-datasets/kedro_datasets/huggingface/transformer_pipeline_dataset.py b/kedro-datasets/kedro_datasets/huggingface/transformer_pipeline_dataset.py
new file mode 100644
index 000000000..8e94a7c03
--- /dev/null
+++ b/kedro-datasets/kedro_datasets/huggingface/transformer_pipeline_dataset.py
@@ -0,0 +1,71 @@
+from __future__ import annotations
+
+import typing as t
+from warnings import warn
+
+from kedro.io import AbstractDataset
+from transformers import Pipeline, pipeline
+
+
+class HFTransformerPipelineDataset(AbstractDataset):
+    """``HFTransformerPipelineDataset`` loads pretrained Hugging Face transformers
+    using the `transformers <https://huggingface.co/docs/transformers>`_ library.
+
+    Example usage for the :doc:`YAML API <kedro:data/data_catalog_yaml_examples>`:
+
+    .. code-block:: yaml
+
+        summarizer_model:
+          type: huggingface.HFTransformerPipelineDataset
+          task: summarization
+
+        fill_mask_model:
+          type: huggingface.HFTransformerPipelineDataset
+          task: fill-mask
+          model_name: Twitter/twhin-bert-base
+
+    Example usage for the :doc:`Python API <kedro:data/advanced_data_catalog_usage>`:
+
+    .. code-block:: pycon
+
+        >>> from kedro_datasets.huggingface import HFTransformerPipelineDataset
+        >>> dataset = HFTransformerPipelineDataset(task="text-classification", model_name="papluca/xlm-roberta-base-language-detection")
+        >>> detector = dataset.load()
+        >>> assert detector("Ceci n'est pas une pipe")[0]["label"] == "fr"
+
+    """
+
+    def __init__(
+        self,
+        task: str | None = None,
+        model_name: str | None = None,
+        pipeline_kwargs: dict[str, t.Any] | None = None,
+    ):
+        if task is None and model_name is None:
+            raise ValueError("At least one of 'task' or 'model_name' is needed")
+        self._task = task if task else None
+        self._model_name = model_name
+        self._pipeline_kwargs = pipeline_kwargs or {}
+
+        if self._pipeline_kwargs and (
+            "task" in self._pipeline_kwargs or "model" in self._pipeline_kwargs
+        ):
+            warn(
+                "Specifying 'task' or 'model' in 'pipeline_kwargs' is not allowed",
+                UserWarning,
+            )
+            self._pipeline_kwargs.pop("task", None)
+            self._pipeline_kwargs.pop("model", None)
+
+    def _load(self) -> Pipeline:
+        return pipeline(self._task, model=self._model_name, **self._pipeline_kwargs)
+
+    def _save(self, pipeline: Pipeline) -> None:
+        raise NotImplementedError("Not yet implemented")
+
+    def _describe(self) -> dict[str, t.Any]:
+        return {
+            "task": self._task,
+            "model_name": self._model_name,
+            "pipeline_kwargs": self._pipeline_kwargs,
+        }
diff --git a/kedro-datasets/setup.py b/kedro-datasets/setup.py
index 016919e91..b8d41323e 100644
--- a/kedro-datasets/setup.py
+++ b/kedro-datasets/setup.py
@@ -25,6 +25,10 @@ def _collect_requirements(requires):
     "geopandas.GeoJSONDataSet": ["geopandas>=0.6.0, <1.0", "pyproj~=3.0"]
 }
 holoviews_require = {"holoviews.HoloviewsWriter": ["holoviews~=1.13.0"]}
+huggingface_require = {
+    "huggingface.HFDataset": ["datasets", "huggingface_hub"],
+    "huggingface.HFTransformerPipelineDataset": ["transformers"],
+}
 matplotlib_require = {"matplotlib.MatplotlibWriter": ["matplotlib>=3.0.3, <4.0"]}
 networkx_require = {"networkx.NetworkXDataSet": ["networkx~=2.4"]}
 pandas_require = {
@@ -102,6 +106,7 @@ def _collect_requirements(requires):
     "databricks": _collect_requirements(databricks_require),
     "geopandas": _collect_requirements(geopandas_require),
     "holoviews": _collect_requirements(holoviews_require),
+    "huggingface": _collect_requirements(huggingface_require),
     "matplotlib": _collect_requirements(matplotlib_require),
     "networkx": _collect_requirements(networkx_require),
     "pandas": _collect_requirements(pandas_require),
@@ -224,6 +229,10 @@ def _collect_requirements(requires):
     "triad>=0.6.7, <1.0",
     "trufflehog~=2.1",
     "xlsxwriter~=1.0",
+    # huggingface
+    "datasets",
+    "huggingface_hub",
+    "transformers",
 ]
 
 setup(
diff --git a/kedro-datasets/tests/huggingface/__init__.py b/kedro-datasets/tests/huggingface/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/kedro-datasets/tests/huggingface/conftest.py b/kedro-datasets/tests/huggingface/conftest.py
new file mode 100644
index 000000000..8630b0dbb
--- /dev/null
+++ b/kedro-datasets/tests/huggingface/conftest.py
@@ -0,0 +1,6 @@
+"""
+This file contains the fixtures that are reusable by any tests within
+this directory. You don't need to import the fixtures as pytest will
+discover them automatically. More info here:
+https://docs.pytest.org/en/latest/fixture.html
+"""
diff --git a/kedro-datasets/tests/huggingface/test_hugging_face_dataset.py b/kedro-datasets/tests/huggingface/test_hugging_face_dataset.py
new file mode 100644
index 000000000..909362ec2
--- /dev/null
+++ b/kedro-datasets/tests/huggingface/test_hugging_face_dataset.py
@@ -0,0 +1,33 @@
+import pytest
+from huggingface_hub import HfApi
+
+from kedro_datasets.huggingface import HFDataset
+
+
+@pytest.fixture
+def dataset_name():
+    return "yelp_review_full"
+
+
+class TestHFDataset:
+    def test_simple_dataset_load(self, dataset_name, mocker):
+        mocked_load_dataset = mocker.patch(
+            "kedro_datasets.huggingface.hugging_face_dataset.load_dataset"
+        )
+
+        dataset = HFDataset(
+            dataset_name=dataset_name,
+        )
+        hf_ds = dataset.load()
+
+        mocked_load_dataset.assert_called_once_with(dataset_name)
+        assert hf_ds is mocked_load_dataset.return_value
+
+    def test_list_datasets(self, mocker):
+        expected_datasets = ["dataset_1", "dataset_2"]
+        mocked_hf_list_datasets = mocker.patch.object(HfApi, "list_datasets")
+        mocked_hf_list_datasets.return_value = expected_datasets
+
+        datasets = HFDataset.list_datasets()
+
+        assert datasets == expected_datasets
diff --git a/kedro-datasets/tests/huggingface/test_transformer_pipeline_dataset.py b/kedro-datasets/tests/huggingface/test_transformer_pipeline_dataset.py
new file mode 100644
index 000000000..9201f9f83
--- /dev/null
+++ b/kedro-datasets/tests/huggingface/test_transformer_pipeline_dataset.py
@@ -0,0 +1,65 @@
+import pytest
+
+from kedro_datasets.huggingface import HFTransformerPipelineDataset
+
+
+@pytest.fixture
+def task():
+    return "fill-mask"
+
+
+@pytest.fixture
+def model_name():
+    return "Twitter/twhin-bert-base"
+
+
+class TestHFTransformerPipelineDataset:
+    def test_simple_dataset_load(self, task, model_name, mocker):
+        mocked_pipeline = mocker.patch(
+            "kedro_datasets.huggingface.transformer_pipeline_dataset.pipeline"
+        )
+
+        dataset = HFTransformerPipelineDataset(
+            task=task,
+            model_name=model_name,
+        )
+        model = dataset.load()
+
+        mocked_pipeline.assert_called_once_with(task, model=model_name)
+        assert model is mocked_pipeline.return_value
+
+    def test_dataset_pipeline_kwargs_load(self, task, model_name, mocker):
+        mocked_pipeline = mocker.patch(
+            "kedro_datasets.huggingface.transformer_pipeline_dataset.pipeline"
+        )
+
+        pipeline_kwargs = {"foo": True}
+        dataset = HFTransformerPipelineDataset(
+            task=task,
+            model_name=model_name,
+            pipeline_kwargs=pipeline_kwargs,
+        )
+        model = dataset.load()
+
+        mocked_pipeline.assert_called_once_with(
+            task, model=model_name, **pipeline_kwargs
+        )
+        assert model is mocked_pipeline.return_value
+
+    def test_dataset_redundant_pipeline_kwargs(self, task, model_name, mocker):
+        pipeline_kwargs = {"task": "redundant"}
+        with pytest.warns(
+            UserWarning,
+            match="Specifying 'task' or 'model' in 'pipeline_kwargs' is not allowed",
+        ):
+            HFTransformerPipelineDataset(
+                task=task,
+                model_name=model_name,
+                pipeline_kwargs=pipeline_kwargs,
+            )
+
+    def test_dataset_incomplete_init(self):
+        with pytest.raises(
+            ValueError, match="At least one of 'task' or 'model_name' is needed"
+        ):
+            HFTransformerPipelineDataset()
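
Reviewer note: below is a minimal, hypothetical sketch of how the two new datasets
could be wired together in a Kedro project, for anyone trying this patch locally.
It assumes ``kedro-datasets`` is installed with the new ``huggingface`` extra
(``pip install "kedro-datasets[huggingface]"``); the catalog keys, node function, and
output names are invented here to mirror the docstring examples above.

.. code-block:: python

    from kedro.io import DataCatalog
    from kedro.pipeline import node, pipeline

    from kedro_datasets.huggingface import HFDataset, HFTransformerPipelineDataset

    # Register the datasets programmatically, mirroring the YAML examples
    # in the docstrings.
    catalog = DataCatalog(
        {
            "yelp_reviews": HFDataset(dataset_name="yelp_review_full"),
            "summarizer_model": HFTransformerPipelineDataset(task="summarization"),
        }
    )


    def summarize_first_review(reviews, summarizer):
        # `reviews` is a datasets.DatasetDict, `summarizer` a transformers.Pipeline;
        # summarization pipelines return a list of {"summary_text": ...} dicts.
        text = reviews["train"][0]["text"]
        return summarizer(text)[0]["summary_text"]


    # "first_summary" is an unregistered output, so Kedro would keep it in memory.
    summarization_pipeline = pipeline(
        [
            node(
                summarize_first_review,
                inputs=["yelp_reviews", "summarizer_model"],
                outputs="first_summary",
            )
        ]
    )

Note that both datasets are load-only in this patch (their ``_save`` methods raise
``NotImplementedError``), so they can only appear as node inputs, never as outputs.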