From a92b911fe69016a30090f502af44cd8a5053cc2a Mon Sep 17 00:00:00 2001 From: Greg Vaslowski <7269272+Vaslo@users.noreply.github.com> Date: Thu, 10 Oct 2024 06:03:44 -0400 Subject: [PATCH 01/19] Update index.md (#4221) Fixed an erroneous link to the Get started with Kedro - Create your first data pipeline with Kedro video. It was accidentally linked to the previous video. Signed-off-by: Greg Vaslowski <7269272+Vaslo@users.noreply.github.com> Signed-off-by: Ankita Katiyar --- docs/source/course/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/course/index.md b/docs/source/course/index.md index 8b3f49c14f..7d9b9100e3 100644 --- a/docs/source/course/index.md +++ b/docs/source/course/index.md @@ -62,7 +62,7 @@ You don't need to register for the course and you can skip around the sections t 1. [Set up the Kedro Data Catalog](https://www.youtube.com/watch?v=rl2cncGxyts) 1. [Explore the spaceflights data](https://www.youtube.com/watch?v=bZD8N0yv3Fs) 1. [Refactor your data processing code into functions](https://www.youtube.com/watch?v=VFcrvnnNas4) -1. [Create your first data pipeline with Kedro](https://www.youtube.com/watch?v=VFcrvnnNas4) +1. [Create your first data pipeline with Kedro](https://www.youtube.com/watch?v=3YeE_gvDCvw) 1. [Assemble your nodes into a Kedro pipeline](https://www.youtube.com/watch?v=P__gFG1TmMo) 1. [Run your Kedro pipeline](https://www.youtube.com/watch?v=sll_LhZE-p8) 1. [Visualise your data pipeline with Kedro-Viz](https://www.youtube.com/watch?v=KWqSzbHgNW4) From e863f1690e1ddfee6f959774affa933a69b08042 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 10 Oct 2024 11:49:16 +0100 Subject: [PATCH 02/19] Bump kedro-sphinx-theme from 2024.4.0 to 2024.10.0 (#4216) * Bump kedro-sphinx-theme from 2024.4.0 to 2024.10.0 Bumps [kedro-sphinx-theme](https://github.com/kedro-org/kedro-sphinx-theme) from 2024.4.0 to 2024.10.0. - [Release notes](https://github.com/kedro-org/kedro-sphinx-theme/releases) - [Commits](https://github.com/kedro-org/kedro-sphinx-theme/compare/v2024.4.0...v2024.10.0) --- updated-dependencies: - dependency-name: kedro-sphinx-theme dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] * updated to 2024.10.2 * trigger_run * trigger_run --------- Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: L. R. Couto <57910428+lrcouto@users.noreply.github.com> Co-authored-by: rashidakanchwala Co-authored-by: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com> Signed-off-by: Ankita Katiyar --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 81eb4e301a..6f8e44f7ff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -83,7 +83,7 @@ test = [ docs = [ "ipykernel>=5.3, <7.0", "Jinja2<3.2.0", - "kedro-sphinx-theme==2024.4.0", + "kedro-sphinx-theme==2024.10.2", "sphinx-notfound-page!=1.0.3", # Required by kedro-sphinx-theme. 1.0.3 results in `AttributeError: 'tuple' object has no attribute 'default'`. 
] jupyter = [ From 2ccba38e20009dd1055fd2a4fc56f3c7366d9e8e Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Thu, 10 Oct 2024 07:05:20 -0600 Subject: [PATCH 03/19] Replace all instances of "data set" with "dataset" (#4211) Signed-off-by: Deepyaman Datta Signed-off-by: Ankita Katiyar --- docs/source/data/data_catalog.md | 2 +- docs/source/integrations/mlflow.md | 2 +- .../nodes_and_pipelines/run_a_pipeline.md | 6 +- .../tutorial/spaceflights_tutorial_faqs.md | 6 +- .../conf/base/catalog.yml | 6 +- .../conf/local/credentials.yml | 2 +- .../pipelines/data_engineering/nodes.py | 2 +- kedro/io/__init__.py | 2 +- kedro/io/catalog_config_resolver.py | 2 +- kedro/io/core.py | 54 ++++++------ kedro/io/data_catalog.py | 88 +++++++++---------- kedro/io/lambda_dataset.py | 14 +-- kedro/io/memory_dataset.py | 2 +- kedro/pipeline/node.py | 4 +- kedro/pipeline/pipeline.py | 8 +- kedro/runner/parallel_runner.py | 12 +-- kedro/runner/sequential_runner.py | 2 +- .../conf/base/catalog.yml | 2 +- .../conf/local/credentials.yml | 2 +- tests/io/test_core.py | 8 +- tests/io/test_data_catalog.py | 52 +++++------ tests/io/test_kedro_data_catalog.py | 6 +- tests/io/test_lambda_dataset.py | 6 +- tests/pipeline/test_pipeline_from_missing.py | 2 +- 24 files changed, 146 insertions(+), 146 deletions(-) diff --git a/docs/source/data/data_catalog.md b/docs/source/data/data_catalog.md index 568e66ee4f..3edb94632a 100644 --- a/docs/source/data/data_catalog.md +++ b/docs/source/data/data_catalog.md @@ -200,7 +200,7 @@ cars: In this example, `filepath` is used as the basis of a folder that stores versions of the `cars` dataset. Each time a new version is created by a pipeline run it is stored within `data/01_raw/company/cars.csv//cars.csv`, where `` corresponds to a version string formatted as `YYYY-MM-DDThh.mm.ss.sssZ`. -By default, `kedro run` loads the latest version of the dataset. However, you can also specify a particular versioned data set with `--load-version` flag as follows: +By default, `kedro run` loads the latest version of the dataset. However, you can also specify a particular versioned dataset with `--load-version` flag as follows: ```bash kedro run --load-versions=cars:YYYY-MM-DDThh.mm.ss.sssZ diff --git a/docs/source/integrations/mlflow.md b/docs/source/integrations/mlflow.md index e2d06a0295..78d3df6c69 100644 --- a/docs/source/integrations/mlflow.md +++ b/docs/source/integrations/mlflow.md @@ -134,7 +134,7 @@ and you would be able to preview it in the MLflow web UI: ``` :::{warning} -If you get a `Failed while saving data to data set MlflowMatplotlibWriter` error, +If you get a `Failed while saving data to dataset MlflowMatplotlibWriter` error, it's probably because you had already executed `kedro run` while the dataset was marked as `versioned: true`. The solution is to cleanup the old `data/08_reporting/dummy_confusion_matrix.png` directory. ::: diff --git a/docs/source/nodes_and_pipelines/run_a_pipeline.md b/docs/source/nodes_and_pipelines/run_a_pipeline.md index 4eaa06c296..2bf1a99383 100644 --- a/docs/source/nodes_and_pipelines/run_a_pipeline.md +++ b/docs/source/nodes_and_pipelines/run_a_pipeline.md @@ -70,13 +70,13 @@ class DryRunner(AbstractRunner): """ def create_default_dataset(self, ds_name: str) -> AbstractDataset: - """Factory method for creating the default data set for the runner. + """Factory method for creating the default dataset for the runner. 
Args: - ds_name: Name of the missing data set + ds_name: Name of the missing dataset Returns: An instance of an implementation of AbstractDataset to be used - for all unregistered data sets. + for all unregistered datasets. """ return MemoryDataset() diff --git a/docs/source/tutorial/spaceflights_tutorial_faqs.md b/docs/source/tutorial/spaceflights_tutorial_faqs.md index ff09d0ae91..ab6d7b8020 100644 --- a/docs/source/tutorial/spaceflights_tutorial_faqs.md +++ b/docs/source/tutorial/spaceflights_tutorial_faqs.md @@ -7,11 +7,11 @@ If you can't find the answer you need here, [ask the Kedro community for help](h ## How do I resolve these common errors? ### Dataset errors -#### DatasetError: Failed while loading data from data set +#### DatasetError: Failed while loading data from dataset You're [testing whether Kedro can load the raw test data](./set_up_data.md#test-that-kedro-can-load-the-data) and see the following: ```python -DatasetError: Failed while loading data from data set +DatasetError: Failed while loading data from dataset CSVDataset(filepath=...). [Errno 2] No such file or directory: '.../companies.csv' ``` @@ -71,6 +71,6 @@ The above exception was the direct cause of the following exception: Traceback (most recent call last): ... raise DatasetError(message) from exc -kedro.io.core.DatasetError: Failed while loading data from data set CSVDataset(filepath=data/03_primary/model_input_table.csv, save_args={'index': False}). +kedro.io.core.DatasetError: Failed while loading data from dataset CSVDataset(filepath=data/03_primary/model_input_table.csv, save_args={'index': False}). [Errno 2] File b'data/03_primary/model_input_table.csv' does not exist: b'data/03_primary/model_input_table.csv' ``` diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/base/catalog.yml b/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/base/catalog.yml index 62280524bd..32da2376b3 100644 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/base/catalog.yml +++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/base/catalog.yml @@ -1,11 +1,11 @@ -# Here you can define all your data sets by using simple YAML syntax. +# Here you can define all your datasets by using simple YAML syntax. # # Documentation for this file format can be found in "The Data Catalog" # Link: https://docs.kedro.org/en/stable/data/data_catalog.html # # We support interacting with a variety of data stores including local file systems, cloud, network and HDFS # -# An example data set definition can look as follows: +# An example dataset definition can look as follows: # #bikes: # type: pandas.CSVDataset @@ -39,7 +39,7 @@ # (transcoding), templating and a way to reuse arguments that are frequently repeated. See more here: # https://docs.kedro.org/en/stable/data/data_catalog.html # -# This is a data set used by the "Hello World" example pipeline provided with the project +# This is a dataset used by the "Hello World" example pipeline provided with the project # template. Please feel free to remove it once you remove the example pipeline. 
example_iris_data: diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/local/credentials.yml b/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/local/credentials.yml index 7fce832f6a..753fe237ed 100644 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/local/credentials.yml +++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/conf/local/credentials.yml @@ -1,4 +1,4 @@ -# Here you can define credentials for different data sets and environment. +# Here you can define credentials for different datasets and environment. # # # Example: diff --git a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_engineering/nodes.py b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_engineering/nodes.py index 024ea394ed..c492614c33 100644 --- a/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_engineering/nodes.py +++ b/features/steps/test_starter/{{ cookiecutter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/data_engineering/nodes.py @@ -11,7 +11,7 @@ def split_data(data: pd.DataFrame, example_test_data_ratio: float) -> dict[str, Any]: - """Node for splitting the classical Iris data set into training and test + """Node for splitting the classical Iris dataset into training and test sets, each split into features and labels. The split ratio parameter is taken from conf/project/parameters.yml. The data and the parameters will be loaded and provided to your function diff --git a/kedro/io/__init__.py b/kedro/io/__init__.py index 9697e1bd35..6384fd6138 100644 --- a/kedro/io/__init__.py +++ b/kedro/io/__init__.py @@ -1,5 +1,5 @@ """``kedro.io`` provides functionality to read and write to a -number of data sets. At the core of the library is the ``AbstractDataset`` class. +number of datasets. At the core of the library is the ``AbstractDataset`` class. """ from __future__ import annotations diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index dc55d18b3c..f722bedb6e 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -90,7 +90,7 @@ def _fetch_credentials(credentials_name: str, credentials: dict[str, Any]) -> An The set of requested credentials. Raises: - KeyError: When a data set with the given name has not yet been + KeyError: When a dataset with the given name has not yet been registered. """ diff --git a/kedro/io/core.py b/kedro/io/core.py index 53b660835c..981e81ccd7 100644 --- a/kedro/io/core.py +++ b/kedro/io/core.py @@ -71,7 +71,7 @@ class DatasetError(Exception): class DatasetNotFoundError(DatasetError): """``DatasetNotFoundError`` raised by ``DataCatalog`` class in case of - trying to use a non-existing data set. + trying to use a non-existing dataset. """ pass @@ -79,7 +79,7 @@ class DatasetNotFoundError(DatasetError): class DatasetAlreadyExistsError(DatasetError): """``DatasetAlreadyExistsError`` raised by ``DataCatalog`` class in case - of trying to add a data set which already exists in the ``DataCatalog``. + of trying to add a dataset which already exists in the ``DataCatalog``. """ pass @@ -87,7 +87,7 @@ class DatasetAlreadyExistsError(DatasetError): class VersionNotFoundError(DatasetError): """``VersionNotFoundError`` raised by ``AbstractVersionedDataset`` implementations - in case of no load versions available for the data set. 
+ in case of no load versions available for the dataset. """ pass @@ -98,9 +98,9 @@ class VersionNotFoundError(DatasetError): class AbstractDataset(abc.ABC, Generic[_DI, _DO]): - """``AbstractDataset`` is the base class for all data set implementations. + """``AbstractDataset`` is the base class for all dataset implementations. - All data set implementations should extend this abstract class + All dataset implementations should extend this abstract class and implement the methods marked as abstract. If a specific dataset implementation cannot be used in conjunction with the ``ParallelRunner``, such user-defined dataset should have the @@ -156,23 +156,23 @@ def from_config( load_version: str | None = None, save_version: str | None = None, ) -> AbstractDataset: - """Create a data set instance using the configuration provided. + """Create a dataset instance using the configuration provided. Args: name: Data set name. config: Data set config dictionary. load_version: Version string to be used for ``load`` operation if - the data set is versioned. Has no effect on the data set + the dataset is versioned. Has no effect on the dataset if versioning was not enabled. save_version: Version string to be used for ``save`` operation if - the data set is versioned. Has no effect on the data set + the dataset is versioned. Has no effect on the dataset if versioning was not enabled. Returns: An instance of an ``AbstractDataset`` subclass. Raises: - DatasetError: When the function fails to create the data set + DatasetError: When the function fails to create the dataset from its config. """ @@ -245,9 +245,9 @@ def load(self: Self) -> _DO: except DatasetError: raise except Exception as exc: - # This exception handling is by design as the composed data sets + # This exception handling is by design as the composed datasets # can throw any type of exception. - message = f"Failed while loading data from data set {self!s}.\n{exc!s}" + message = f"Failed while loading data from dataset {self!s}.\n{exc!s}" raise DatasetError(message) from exc load.__annotations__["return"] = load_func.__annotations__.get("return") @@ -271,7 +271,7 @@ def save(self: Self, data: _DI) -> None: except (DatasetError, FileNotFoundError, NotADirectoryError): raise except Exception as exc: - message = f"Failed while saving data to data set {self!s}.\n{exc!s}" + message = f"Failed while saving data to dataset {self!s}.\n{exc!s}" raise DatasetError(message) from exc save.__annotations__["data"] = save_func.__annotations__.get("data", Any) @@ -377,7 +377,7 @@ def _describe(self) -> dict[str, Any]: ) def exists(self) -> bool: - """Checks whether a data set's output already exists by calling + """Checks whether a dataset's output already exists by calling the provided _exists() method. 
Returns: @@ -391,7 +391,7 @@ def exists(self) -> bool: self._logger.debug("Checking whether target of %s exists", str(self)) return self._exists() except Exception as exc: - message = f"Failed during exists check for data set {self!s}.\n{exc!s}" + message = f"Failed during exists check for dataset {self!s}.\n{exc!s}" raise DatasetError(message) from exc def _exists(self) -> bool: @@ -412,7 +412,7 @@ def release(self) -> None: self._logger.debug("Releasing %s", str(self)) self._release() except Exception as exc: - message = f"Failed during release for data set {self!s}.\n{exc!s}" + message = f"Failed during release for dataset {self!s}.\n{exc!s}" raise DatasetError(message) from exc def _release(self) -> None: @@ -438,7 +438,7 @@ def generate_timestamp() -> str: class Version(namedtuple("Version", ["load", "save"])): """This namedtuple is used to provide load and save versions for versioned - data sets. If ``Version.load`` is None, then the latest available version + datasets. If ``Version.load`` is None, then the latest available version is loaded. If ``Version.save`` is None, then save version is formatted as YYYY-MM-DDThh.mm.ss.sssZ of the current timestamp. """ @@ -450,7 +450,7 @@ class Version(namedtuple("Version", ["load", "save"])): "Save version '{}' did not match load version '{}' for {}. This is strongly " "discouraged due to inconsistencies it may cause between 'save' and " "'load' operations. Please refrain from setting exact load version for " - "intermediate data sets where possible to avoid this warning." + "intermediate datasets where possible to avoid this warning." ) _DEFAULT_PACKAGES = ["kedro.io.", "kedro_datasets.", ""] @@ -467,10 +467,10 @@ def parse_dataset_definition( config: Data set config dictionary. It *must* contain the `type` key with fully qualified class name or the class object. load_version: Version string to be used for ``load`` operation if - the data set is versioned. Has no effect on the data set + the dataset is versioned. Has no effect on the dataset if versioning was not enabled. save_version: Version string to be used for ``save`` operation if - the data set is versioned. Has no effect on the data set + the dataset is versioned. Has no effect on the dataset if versioning was not enabled. Raises: @@ -522,14 +522,14 @@ def parse_dataset_definition( if not issubclass(class_obj, AbstractDataset): raise DatasetError( f"Dataset type '{class_obj.__module__}.{class_obj.__qualname__}' " - f"is invalid: all data set types must extend 'AbstractDataset'." + f"is invalid: all dataset types must extend 'AbstractDataset'." ) if VERSION_KEY in config: # remove "version" key so that it's not passed - # to the "unversioned" data set constructor + # to the "unversioned" dataset constructor message = ( - "'%s' attribute removed from data set configuration since it is a " + "'%s' attribute removed from dataset configuration since it is a " "reserved word and cannot be directly specified" ) logging.getLogger(__name__).warning(message, VERSION_KEY) @@ -579,10 +579,10 @@ def _local_exists(local_filepath: str) -> bool: # SKIP_IF_NO_SPARK class AbstractVersionedDataset(AbstractDataset[_DI, _DO], abc.ABC): """ - ``AbstractVersionedDataset`` is the base class for all versioned data set + ``AbstractVersionedDataset`` is the base class for all versioned dataset implementations. - All data sets that implement versioning should extend this + All datasets that implement versioning should extend this abstract class and implement the methods marked as abstract. 
Example: @@ -764,7 +764,7 @@ def save(self: Self, data: _DI) -> None: return save def exists(self) -> bool: - """Checks whether a data set's output already exists by calling + """Checks whether a dataset's output already exists by calling the provided _exists() method. Returns: @@ -780,7 +780,7 @@ def exists(self) -> bool: except VersionNotFoundError: return False except Exception as exc: # SKIP_IF_NO_SPARK - message = f"Failed during exists check for data set {self!s}.\n{exc!s}" + message = f"Failed during exists check for dataset {self!s}.\n{exc!s}" raise DatasetError(message) from exc def _release(self) -> None: @@ -938,7 +938,7 @@ def add_feed_dict(self, datasets: dict[str, Any], replace: bool = False) -> None ... def exists(self, name: str) -> bool: - """Checks whether registered data set exists by calling its `exists()` method.""" + """Checks whether registered dataset exists by calling its `exists()` method.""" ... def release(self, name: str) -> None: diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index a010f3e852..6f9a678272 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -2,7 +2,7 @@ provide ``load`` and ``save`` capabilities from anywhere in the program. To use a ``DataCatalog``, you need to instantiate it with a dictionary of data sets. Then it will act as a single point of reference for your calls, -relaying load and save functions to the underlying data sets. +relaying load and save functions to the underlying datasets. """ from __future__ import annotations @@ -35,10 +35,10 @@ def _sub_nonword_chars(dataset_name: str) -> str: - """Replace non-word characters in data set names since Kedro 0.16.2. + """Replace non-word characters in dataset names since Kedro 0.16.2. Args: - dataset_name: The data set name registered in the data catalog. + dataset_name: The dataset name registered in the data catalog. Returns: The name used in `DataCatalog.datasets`. @@ -102,9 +102,9 @@ class DataCatalog: """``DataCatalog`` stores instances of ``AbstractDataset`` implementations to provide ``load`` and ``save`` capabilities from anywhere in the program. To use a ``DataCatalog``, you need to instantiate it with - a dictionary of data sets. Then it will act as a single point of reference + a dictionary of datasets. Then it will act as a single point of reference for your calls, relaying load and save functions - to the underlying data sets. + to the underlying datasets. """ def __init__( # noqa: PLR0913 @@ -120,15 +120,15 @@ def __init__( # noqa: PLR0913 """``DataCatalog`` stores instances of ``AbstractDataset`` implementations to provide ``load`` and ``save`` capabilities from anywhere in the program. To use a ``DataCatalog``, you need to - instantiate it with a dictionary of data sets. Then it will act as a + instantiate it with a dictionary of datasets. Then it will act as a single point of reference for your calls, relaying load and save - functions to the underlying data sets. + functions to the underlying datasets. Args: - datasets: A dictionary of data set names and data set instances. + datasets: A dictionary of dataset names and dataset instances. feed_dict: A feed dict with data to be added in memory. - dataset_patterns: A dictionary of data set factory patterns - and corresponding data set configuration. When fetched from catalog configuration + dataset_patterns: A dictionary of dataset factory patterns + and corresponding dataset configuration. When fetched from catalog configuration these patterns will be sorted by: 1. 
Decreasing specificity (number of characters outside the curly brackets) 2. Decreasing number of placeholders (number of curly bracket pairs) @@ -137,10 +137,10 @@ def __init__( # noqa: PLR0913 pattern provided through the runners if it comes before "default" in the alphabet. Such an overwriting pattern will emit a warning. The `"{default}"` name will not emit a warning. - load_versions: A mapping between data set names and versions - to load. Has no effect on data sets without enabled versioning. + load_versions: A mapping between dataset names and versions + to load. Has no effect on datasets without enabled versioning. save_version: Version string to be used for ``save`` operations - by all data sets with enabled versioning. It must: a) be a + by all datasets with enabled versioning. It must: a) be a case-insensitive string that conforms with operating system filename limitations, b) always return the latest version when sorted in lexicographical order. @@ -216,28 +216,28 @@ def from_config( ``DataCatalog`` with configuration parsed from configuration files. Args: - catalog: A dictionary whose keys are the data set names and + catalog: A dictionary whose keys are the dataset names and the values are dictionaries with the constructor arguments - for classes implementing ``AbstractDataset``. The data set + for classes implementing ``AbstractDataset``. The dataset class to be loaded is specified with the key ``type`` and their - fully qualified class name. All ``kedro.io`` data set can be + fully qualified class name. All ``kedro.io`` dataset can be specified by their class name only, i.e. their module name can be omitted. credentials: A dictionary containing credentials for different - data sets. Use the ``credentials`` key in a ``AbstractDataset`` + datasets. Use the ``credentials`` key in a ``AbstractDataset`` to refer to the appropriate credentials as shown in the example below. load_versions: A mapping between dataset names and versions - to load. Has no effect on data sets without enabled versioning. + to load. Has no effect on datasets without enabled versioning. save_version: Version string to be used for ``save`` operations - by all data sets with enabled versioning. It must: a) be a + by all datasets with enabled versioning. It must: a) be a case-insensitive string that conforms with operating system filename limitations, b) always return the latest version when sorted in lexicographical order. Returns: An instantiated ``DataCatalog`` containing all specified - data sets, created and ready to use. + datasets, created and ready to use. Raises: DatasetError: When the method fails to create any of the data @@ -356,10 +356,10 @@ def _get_dataset( return dataset def load(self, name: str, version: str | None = None) -> Any: - """Loads a registered data set. + """Loads a registered dataset. Args: - name: A data set to be loaded. + name: A dataset to be loaded. version: Optional argument for concrete data version to be loaded. Works only with versioned datasets. @@ -367,7 +367,7 @@ def load(self, name: str, version: str | None = None) -> Any: The loaded data as configured. Raises: - DatasetNotFoundError: When a data set with the given name + DatasetNotFoundError: When a dataset with the given name has not yet been registered. Example: @@ -398,15 +398,15 @@ def load(self, name: str, version: str | None = None) -> Any: return result def save(self, name: str, data: Any) -> None: - """Save data to a registered data set. + """Save data to a registered dataset. 
Args: - name: A data set to be saved to. + name: A dataset to be saved to. data: A data object to be saved as configured in the registered - data set. + dataset. Raises: - DatasetNotFoundError: When a data set with the given name + DatasetNotFoundError: When a dataset with the given name has not yet been registered. Example: @@ -438,15 +438,15 @@ def save(self, name: str, data: Any) -> None: dataset.save(data) def exists(self, name: str) -> bool: - """Checks whether registered data set exists by calling its `exists()` + """Checks whether registered dataset exists by calling its `exists()` method. Raises a warning and returns False if `exists()` is not implemented. Args: - name: A data set to be checked. + name: A dataset to be checked. Returns: - Whether the data set output exists. + Whether the dataset output exists. """ try: @@ -456,13 +456,13 @@ def exists(self, name: str) -> bool: return dataset.exists() def release(self, name: str) -> None: - """Release any cached data associated with a data set + """Release any cached data associated with a dataset Args: - name: A data set to be checked. + name: A dataset to be checked. Raises: - DatasetNotFoundError: When a data set with the given name + DatasetNotFoundError: When a dataset with the given name has not yet been registered. """ dataset = self._get_dataset(name) @@ -477,15 +477,15 @@ def add( """Adds a new ``AbstractDataset`` object to the ``DataCatalog``. Args: - dataset_name: A unique data set name which has not been + dataset_name: A unique dataset name which has not been registered yet. - dataset: A data set object to be associated with the given data + dataset: A dataset object to be associated with the given data set name. replace: Specifies whether to replace an existing dataset with the same name is allowed. Raises: - DatasetAlreadyExistsError: When a data set with the same name + DatasetAlreadyExistsError: When a dataset with the same name has already been registered. Example: @@ -514,7 +514,7 @@ def add_all( datasets: dict[str, AbstractDataset], replace: bool = False, ) -> None: - """Adds a group of new data sets to the ``DataCatalog``. + """Adds a group of new datasets to the ``DataCatalog``. Args: datasets: A dictionary of dataset names and dataset @@ -523,7 +523,7 @@ def add_all( with the same name is allowed. Raises: - DatasetAlreadyExistsError: When a data set with the same name + DatasetAlreadyExistsError: When a dataset with the same name has already been registered. Example: @@ -597,10 +597,10 @@ def list(self, regex_search: str | None = None) -> list[str]: Args: regex_search: An optional regular expression which can be provided - to limit the data sets returned by a particular pattern. + to limit the datasets returned by a particular pattern. Returns: A list of dataset names available which match the - `regex_search` criteria (if provided). All data set names are returned + `regex_search` criteria (if provided). All dataset names are returned by default. 
Raises: @@ -610,11 +610,11 @@ def list(self, regex_search: str | None = None) -> list[str]: :: >>> catalog = DataCatalog() - >>> # get data sets where the substring 'raw' is present + >>> # get datasets where the substring 'raw' is present >>> raw_data = catalog.list(regex_search='raw') - >>> # get data sets which start with 'prm' or 'feat' + >>> # get datasets which start with 'prm' or 'feat' >>> feat_eng_data = catalog.list(regex_search='^(prm|feat)') - >>> # get data sets which end with 'time_series' + >>> # get datasets which end with 'time_series' >>> models = catalog.list(regex_search='.+time_series$') """ @@ -622,7 +622,7 @@ def list(self, regex_search: str | None = None) -> list[str]: return list(self._datasets.keys()) if not regex_search.strip(): - self._logger.warning("The empty string will not match any data sets") + self._logger.warning("The empty string will not match any datasets") return [] try: diff --git a/kedro/io/lambda_dataset.py b/kedro/io/lambda_dataset.py index 043bb67737..d120f74ed2 100644 --- a/kedro/io/lambda_dataset.py +++ b/kedro/io/lambda_dataset.py @@ -11,11 +11,11 @@ class LambdaDataset(AbstractDataset): - """``LambdaDataset`` loads and saves data to a data set. + """``LambdaDataset`` loads and saves data to a dataset. It relies on delegating to specific implementation such as csv, sql, etc. ``LambdaDataset`` class captures Exceptions while performing operations on - composed ``Dataset`` implementations. The composed data set is + composed ``Dataset`` implementations. The composed dataset is responsible for providing information on how to resolve the issue when possible. This information should be available through str(error). @@ -53,7 +53,7 @@ def _to_str(func: Any) -> str | None: def _load(self) -> Any: if not self.__load: raise DatasetError( - "Cannot load data set. No 'load' function " + "Cannot load dataset. No 'load' function " "provided when LambdaDataset was created." ) return self.__load() @@ -61,7 +61,7 @@ def _load(self) -> Any: def _save(self, data: Any) -> None: if not self.__save: raise DatasetError( - "Cannot save to data set. No 'save' function " + "Cannot save to dataset. No 'save' function " "provided when LambdaDataset was created." ) self.__save(data) @@ -86,11 +86,11 @@ def __init__( metadata: dict[str, Any] | None = None, ): """Creates a new instance of ``LambdaDataset`` with references to the - required input/output data set methods. + required input/output dataset methods. Args: - load: Method to load data from a data set. - save: Method to save data to a data set. + load: Method to load data from a dataset. + save: Method to save data to a dataset. exists: Method to check whether output data already exists. release: Method to release any cached information. metadata: Any arbitrary metadata. diff --git a/kedro/io/memory_dataset.py b/kedro/io/memory_dataset.py index 1b4bb8a371..1e8eef8452 100644 --- a/kedro/io/memory_dataset.py +++ b/kedro/io/memory_dataset.py @@ -1,4 +1,4 @@ -"""``MemoryDataset`` is a data set implementation which handles in-memory data.""" +"""``MemoryDataset`` is a dataset implementation which handles in-memory data.""" from __future__ import annotations diff --git a/kedro/pipeline/node.py b/kedro/pipeline/node.py index b382bee8cf..a303546279 100644 --- a/kedro/pipeline/node.py +++ b/kedro/pipeline/node.py @@ -59,7 +59,7 @@ def __init__( # noqa: PLR0913 contain only letters, digits, hyphens, underscores and/or fullstops. confirms: Optional name or the list of the names of the datasets that should be confirmed. 
This will result in calling - ``confirm()`` method of the corresponding data set instance. + ``confirm()`` method of the corresponding dataset instance. Specified dataset names do not necessarily need to be present in the node ``inputs`` or ``outputs``. namespace: Optional node namespace. @@ -601,7 +601,7 @@ def node( # noqa: PLR0913 tags: Optional set of tags to be applied to the node. confirms: Optional name or the list of the names of the datasets that should be confirmed. This will result in calling ``confirm()`` - method of the corresponding data set instance. Specified dataset + method of the corresponding dataset instance. Specified dataset names do not necessarily need to be present in the node ``inputs`` or ``outputs``. namespace: Optional node namespace. diff --git a/kedro/pipeline/pipeline.py b/kedro/pipeline/pipeline.py index ab7365a154..749eea8548 100644 --- a/kedro/pipeline/pipeline.py +++ b/kedro/pipeline/pipeline.py @@ -93,8 +93,8 @@ def __init__( >>> from kedro.pipeline import node >>> >>> # In the following scenario first_ds and second_ds - >>> # are data sets provided by io. Pipeline will pass these - >>> # data sets to first_node function and provides the result + >>> # are datasets provided by io. Pipeline will pass these + >>> # datasets to first_node function and provides the result >>> # to the second_node as input. >>> >>> def first_node(first_ds, second_ds): @@ -247,11 +247,11 @@ def outputs(self) -> set[str]: return self._remove_intermediates(self.all_outputs()) def datasets(self) -> set[str]: - """The names of all data sets used by the ``Pipeline``, + """The names of all datasets used by the ``Pipeline``, including inputs and outputs. Returns: - The set of all pipeline data sets. + The set of all pipeline datasets. """ return self.all_outputs() | self.all_inputs() diff --git a/kedro/runner/parallel_runner.py b/kedro/runner/parallel_runner.py index 7626bf8679..4bbcdc9ec5 100644 --- a/kedro/runner/parallel_runner.py +++ b/kedro/runner/parallel_runner.py @@ -43,7 +43,7 @@ class ParallelRunnerManager(SyncManager): """``ParallelRunnerManager`` is used to create shared ``MemoryDataset`` - objects as default data sets in a pipeline. + objects as default datasets in a pipeline. """ @@ -171,8 +171,8 @@ def _validate_nodes(cls, nodes: Iterable[Node]) -> None: @classmethod def _validate_catalog(cls, catalog: CatalogProtocol, pipeline: Pipeline) -> None: - """Ensure that all data sets are serialisable and that we do not have - any non proxied memory data sets being used as outputs as their content + """Ensure that all datasets are serialisable and that we do not have + any non proxied memory datasets being used as outputs as their content will not be synchronized across threads. """ @@ -190,9 +190,9 @@ def _validate_catalog(cls, catalog: CatalogProtocol, pipeline: Pipeline) -> None if unserialisable: raise AttributeError( - f"The following data sets cannot be used with multiprocessing: " + f"The following datasets cannot be used with multiprocessing: " f"{sorted(unserialisable)}\nIn order to utilize multiprocessing you " - f"need to make sure all data sets are serialisable, i.e. data sets " + f"need to make sure all datasets are serialisable, i.e. datasets " f"should not make use of lambda functions, nested functions, closures " f"etc.\nIf you are using custom decorators ensure they are correctly " f"decorated using functools.wraps()." 
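The `functools.wraps()` advice in the error message above can be made concrete with a minimal sketch (the `log_calls` decorator and `preprocess` function below are hypothetical illustrations, not part of this changeset). Without `@functools.wraps`, the wrapper's `__qualname__` would be `log_calls.<locals>.wrapper`, which pickle cannot resolve at module level, so `ParallelRunner` would reject the node function as unserialisable:

```python
import functools

def log_calls(func):
    @functools.wraps(func)  # copies __name__/__qualname__/__module__ from func
    def wrapper(*args, **kwargs):
        print(f"Calling {func.__name__}")
        return func(*args, **kwargs)
    return wrapper

@log_calls
def preprocess(data):
    # A module-level node function; because @functools.wraps restores its
    # original qualified name, pickle can look "preprocess" up at module
    # level and serialisation for multiprocessing succeeds.
    return data
```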
@@ -209,7 +209,7 @@ def _validate_catalog(cls, catalog: CatalogProtocol, pipeline: Pipeline) -> None if memory_datasets: raise AttributeError( - f"The following data sets are memory data sets: " + f"The following datasets are memory datasets: " f"{sorted(memory_datasets)}\n" f"ParallelRunner does not support output to externally created " f"MemoryDatasets" diff --git a/kedro/runner/sequential_runner.py b/kedro/runner/sequential_runner.py index c888e737cf..57a7aef17f 100644 --- a/kedro/runner/sequential_runner.py +++ b/kedro/runner/sequential_runner.py @@ -81,7 +81,7 @@ def _run( self._suggest_resume_scenario(pipeline, done_nodes, catalog) raise - # decrement load counts and release any data sets we've finished with + # decrement load counts and release any datasets we've finished with for dataset in node.inputs: load_counts[dataset] -= 1 if load_counts[dataset] < 1 and dataset not in pipeline.inputs(): diff --git a/kedro/templates/project/{{ cookiecutter.repo_name }}/conf/base/catalog.yml b/kedro/templates/project/{{ cookiecutter.repo_name }}/conf/base/catalog.yml index be73adae2a..789fc96fd1 100644 --- a/kedro/templates/project/{{ cookiecutter.repo_name }}/conf/base/catalog.yml +++ b/kedro/templates/project/{{ cookiecutter.repo_name }}/conf/base/catalog.yml @@ -1,4 +1,4 @@ -# Here you can define all your data sets by using simple YAML syntax. +# Here you can define all your datasets by using simple YAML syntax. # # Documentation for this file format can be found in "The Data Catalog" # Link: https://docs.kedro.org/en/stable/data/data_catalog.html diff --git a/kedro/templates/project/{{ cookiecutter.repo_name }}/conf/local/credentials.yml b/kedro/templates/project/{{ cookiecutter.repo_name }}/conf/local/credentials.yml index b2db154dbc..b9a9cea667 100644 --- a/kedro/templates/project/{{ cookiecutter.repo_name }}/conf/local/credentials.yml +++ b/kedro/templates/project/{{ cookiecutter.repo_name }}/conf/local/credentials.yml @@ -1,4 +1,4 @@ -# Here you can define credentials for different data sets and environment. +# Here you can define credentials for different datasets and environment. 
# # # Example: diff --git a/tests/io/test_core.py b/tests/io/test_core.py index 4128ad6da2..286a7142fd 100644 --- a/tests/io/test_core.py +++ b/tests/io/test_core.py @@ -359,7 +359,7 @@ def test_version_str_repr(self, load_version, save_version): def test_save_and_load(self, my_versioned_dataset, dummy_data): """Test that saved and reloaded data matches the original one for - the versioned data set.""" + the versioned dataset.""" my_versioned_dataset.save(dummy_data) reloaded = my_versioned_dataset.load() assert dummy_data == reloaded @@ -398,14 +398,14 @@ def test_exists_general_exception(self): my_other_versioned_dataset.exists() def test_exists(self, my_versioned_dataset, dummy_data): - """Test `exists` method invocation for versioned data set.""" + """Test `exists` method invocation for versioned dataset.""" assert not my_versioned_dataset.exists() my_versioned_dataset.save(dummy_data) assert my_versioned_dataset.exists() shutil.rmtree(my_versioned_dataset._filepath) def test_prevent_overwrite(self, my_versioned_dataset, dummy_data): - """Check the error when attempting to override the data set if the + """Check the error when attempting to override the dataset if the corresponding json file for a given save version already exists.""" my_versioned_dataset.save(dummy_data) pattern = ( @@ -550,7 +550,7 @@ def test_saving_none(self, my_legacy_dataset): my_legacy_dataset.save(None) def test_saving_invalid_data(self, my_legacy_dataset, dummy_data): - pattern = r"Failed while saving data to data set" + pattern = r"Failed while saving data to dataset" with pytest.raises(DatasetError, match=pattern): my_legacy_dataset.save(pd.DataFrame()) diff --git a/tests/io/test_data_catalog.py b/tests/io/test_data_catalog.py index 54cbdf340d..bbaf6e8c6b 100644 --- a/tests/io/test_data_catalog.py +++ b/tests/io/test_data_catalog.py @@ -168,14 +168,14 @@ def data_catalog_from_config(correct_config): class TestDataCatalog: def test_save_and_load(self, data_catalog, dummy_dataframe): - """Test saving and reloading the data set""" + """Test saving and reloading the dataset""" data_catalog.save("test", dummy_dataframe) reloaded_df = data_catalog.load("test") assert_frame_equal(reloaded_df, dummy_dataframe) def test_add_save_and_load(self, dataset, dummy_dataframe): - """Test adding and then saving and reloading the data set""" + """Test adding and then saving and reloading the dataset""" catalog = DataCatalog(datasets={}) catalog.add("test", dataset) catalog.save("test", dummy_dataframe) @@ -185,7 +185,7 @@ def test_add_save_and_load(self, dataset, dummy_dataframe): def test_add_all_save_and_load(self, dataset, dummy_dataframe): """Test adding all to the data catalog and then saving and reloading - the data set""" + the dataset""" catalog = DataCatalog(datasets={}) catalog.add_all({"test": dataset}) catalog.save("test", dummy_dataframe) @@ -194,34 +194,34 @@ def test_add_all_save_and_load(self, dataset, dummy_dataframe): assert_frame_equal(reloaded_df, dummy_dataframe) def test_load_error(self, data_catalog): - """Check the error when attempting to load a data set + """Check the error when attempting to load a dataset from nonexistent source""" - pattern = r"Failed while loading data from data set CSVDataset" + pattern = r"Failed while loading data from dataset CSVDataset" with pytest.raises(DatasetError, match=pattern): data_catalog.load("test") def test_add_dataset_twice(self, data_catalog, dataset): - """Check the error when attempting to add the data set twice""" + """Check the error when attempting to add 
the dataset twice""" pattern = r"Dataset 'test' has already been registered" with pytest.raises(DatasetAlreadyExistsError, match=pattern): data_catalog.add("test", dataset) def test_load_from_unregistered(self): - """Check the error when attempting to load unregistered data set""" + """Check the error when attempting to load unregistered dataset""" catalog = DataCatalog(datasets={}) pattern = r"Dataset 'test' not found in the catalog" with pytest.raises(DatasetNotFoundError, match=pattern): catalog.load("test") def test_save_to_unregistered(self, dummy_dataframe): - """Check the error when attempting to save to unregistered data set""" + """Check the error when attempting to save to unregistered dataset""" catalog = DataCatalog(datasets={}) pattern = r"Dataset 'test' not found in the catalog" with pytest.raises(DatasetNotFoundError, match=pattern): catalog.save("test", dummy_dataframe) def test_feed_dict(self, memory_catalog, conflicting_feed_dict): - """Test feed dict overriding some of the data sets""" + """Test feed dict overriding some of the datasets""" memory_catalog.add_feed_dict(conflicting_feed_dict, replace=True) assert "data" in memory_catalog.load("ds1") assert memory_catalog.load("ds1")["data"] == 0 @@ -235,7 +235,7 @@ def test_exists(self, data_catalog, dummy_dataframe): assert data_catalog.exists("test") def test_exists_not_implemented(self, caplog): - """Test calling `exists` on the data set, which didn't implement it""" + """Test calling `exists` on the dataset, which didn't implement it""" catalog = DataCatalog(datasets={"test": LambdaDataset(None, None)}) result = catalog.exists("test") @@ -248,18 +248,18 @@ def test_exists_not_implemented(self, caplog): assert result is False def test_exists_invalid(self, data_catalog): - """Check the error when calling `exists` on invalid data set""" + """Check the error when calling `exists` on invalid dataset""" assert not data_catalog.exists("wrong_key") def test_release_unregistered(self, data_catalog): - """Check the error when calling `release` on unregistered data set""" + """Check the error when calling `release` on unregistered dataset""" pattern = r"Dataset \'wrong_key\' not found in the catalog" with pytest.raises(DatasetNotFoundError, match=pattern) as e: data_catalog.release("wrong_key") assert "did you mean" not in str(e.value) def test_release_unregistered_typo(self, data_catalog): - """Check the error when calling `release` on mistyped data set""" + """Check the error when calling `release` on mistyped dataset""" pattern = ( "Dataset 'text' not found in the catalog" " - did you mean one of these instead: test" @@ -268,7 +268,7 @@ def test_release_unregistered_typo(self, data_catalog): data_catalog.release("text") def test_multi_catalog_list(self, multi_catalog): - """Test data catalog which contains multiple data sets""" + """Test data catalog which contains multiple datasets""" entries = multi_catalog.list() assert "abc" in entries assert "xyz" in entries @@ -284,7 +284,7 @@ def test_multi_catalog_list(self, multi_catalog): ], ) def test_multi_catalog_list_regex(self, multi_catalog, pattern, expected): - """Test that regex patterns filter data sets accordingly""" + """Test that regex patterns filter datasets accordingly""" assert multi_catalog.list(regex_search=pattern) == expected def test_multi_catalog_list_bad_regex(self, multi_catalog): @@ -404,7 +404,7 @@ def test_from_correct_config(self, data_catalog_from_config, dummy_dataframe): assert_frame_equal(reloaded_df, dummy_dataframe) def 
test_config_missing_type(self, correct_config): - """Check the error if type attribute is missing for some data set(s) + """Check the error if type attribute is missing for some dataset(s) in the config""" del correct_config["catalog"]["boats"]["type"] pattern = ( @@ -468,13 +468,13 @@ def test_config_invalid_dataset(self, correct_config): pattern = ( "An exception occurred when parsing config for dataset 'boats':\n" "Dataset type 'kedro.io.data_catalog.DataCatalog' is invalid: " - "all data set types must extend 'AbstractDataset'" + "all dataset types must extend 'AbstractDataset'" ) with pytest.raises(DatasetError, match=re.escape(pattern)): DataCatalog.from_config(**correct_config) def test_config_invalid_arguments(self, correct_config): - """Check the error if the data set config contains invalid arguments""" + """Check the error if the dataset config contains invalid arguments""" correct_config["catalog"]["boats"]["save_and_load_args"] = False pattern = ( r"Dataset 'boats' must only contain arguments valid for " @@ -504,7 +504,7 @@ def test_missing_credentials(self, correct_config): DataCatalog.from_config(**correct_config) def test_link_credentials(self, correct_config, mocker): - """Test credentials being linked to the relevant data set""" + """Test credentials being linked to the relevant dataset""" mock_client = mocker.patch("kedro_datasets.pandas.csv_dataset.fsspec") config = deepcopy(correct_config) del config["catalog"]["boats"] @@ -560,7 +560,7 @@ def test_idempotent_catalog(self, correct_config): assert catalog def test_error_dataset_init(self, bad_config): - """Check the error when trying to instantiate erroneous data set""" + """Check the error when trying to instantiate erroneous dataset""" pattern = r"Failed to instantiate dataset \'bad\' of type '.*BadDataset'" with pytest.raises(DatasetError, match=pattern): DataCatalog.from_config(bad_config, None) @@ -606,7 +606,7 @@ def test_bad_confirm(self, correct_config, dataset_name, pattern): class TestDataCatalogVersioned: def test_from_correct_config_versioned(self, correct_config, dummy_dataframe): - """Test load and save of versioned data sets from config""" + """Test load and save of versioned datasets from config""" correct_config["catalog"]["boats"]["versioned"] = True # Decompose `generate_timestamp` to keep `current_ts` reference. 
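The versioned-catalog behaviour exercised by the test above is the same one described in `data_catalog.md` earlier in this series: each save lands in a timestamped folder, and a specific version can be pinned at load time. A minimal sketch, assuming an illustrative `cars` entry (the dataset name, filepath and version string are hypothetical):

```python
from kedro.io import DataCatalog

# Hypothetical catalog config with versioning enabled.
config = {
    "cars": {
        "type": "pandas.CSVDataset",
        "filepath": "data/01_raw/company/cars.csv",
        "versioned": True,
    }
}

catalog = DataCatalog.from_config(config)
latest = catalog.load("cars")  # loads the latest available version
pinned = catalog.load("cars", version="2024-09-01T18.53.11.571Z")  # pins one version
```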
@@ -649,13 +649,13 @@ def test_from_correct_config_versioned_warn( self, caplog, correct_config, versioned ): """Check the warning if `version` attribute was added - to the data set config""" + to the dataset config""" correct_config["catalog"]["boats"]["versioned"] = versioned correct_config["catalog"]["boats"]["version"] = True DataCatalog.from_config(**correct_config) log_record = caplog.records[0] expected_log_message = ( - "'version' attribute removed from data set configuration since it " + "'version' attribute removed from dataset configuration since it " "is a reserved word and cannot be directly specified" ) assert log_record.levelname == "WARNING" @@ -672,7 +672,7 @@ def test_from_correct_config_load_versions_warn(self, correct_config): def test_compare_tracking_and_other_dataset_versioned( self, correct_config_with_tracking_ds, dummy_dataframe ): - """Test saving of tracking data sets from config results in the same + """Test saving of tracking datasets from config results in the same save version as other versioned datasets.""" catalog = DataCatalog.from_config(**correct_config_with_tracking_ds) @@ -694,7 +694,7 @@ def test_compare_tracking_and_other_dataset_versioned( assert tracking_timestamp == csv_timestamp def test_load_version(self, correct_config, dummy_dataframe, mocker): - """Test load versioned data sets from config""" + """Test load versioned datasets from config""" new_dataframe = pd.DataFrame({"col1": [0, 0], "col2": [0, 0], "col3": [0, 0]}) correct_config["catalog"]["boats"]["versioned"] = True mocker.patch( @@ -938,7 +938,7 @@ def test_unmatched_key_error_when_parsing_config( def test_factory_config_versioned( self, config_with_dataset_factories, filepath, dummy_dataframe ): - """Test load and save of versioned data sets from config""" + """Test load and save of versioned datasets from config""" config_with_dataset_factories["catalog"]["{brand}_cars"]["versioned"] = True config_with_dataset_factories["catalog"]["{brand}_cars"]["filepath"] = filepath diff --git a/tests/io/test_kedro_data_catalog.py b/tests/io/test_kedro_data_catalog.py index 5e0c463e7d..efa993bb0e 100644 --- a/tests/io/test_kedro_data_catalog.py +++ b/tests/io/test_kedro_data_catalog.py @@ -74,7 +74,7 @@ def test_add_save_and_load(self, dataset, dummy_dataframe): def test_load_error(self, data_catalog): """Check the error when attempting to load a dataset from nonexistent source""" - pattern = r"Failed while loading data from data set CSVDataset" + pattern = r"Failed while loading data from dataset CSVDataset" with pytest.raises(DatasetError, match=pattern): data_catalog.load("test") @@ -352,7 +352,7 @@ def test_config_invalid_dataset(self, correct_config): pattern = ( "An exception occurred when parsing config for dataset 'boats':\n" "Dataset type 'kedro.io.kedro_data_catalog.KedroDataCatalog' is invalid: " - "all data set types must extend 'AbstractDataset'" + "all dataset types must extend 'AbstractDataset'" ) with pytest.raises(DatasetError, match=re.escape(pattern)): KedroDataCatalog.from_config(**correct_config) @@ -553,7 +553,7 @@ def test_from_correct_config_versioned_warn( KedroDataCatalog.from_config(**correct_config) log_record = caplog.records[0] expected_log_message = ( - "'version' attribute removed from data set configuration since it " + "'version' attribute removed from dataset configuration since it " "is a reserved word and cannot be directly specified" ) assert log_record.levelname == "WARNING" diff --git a/tests/io/test_lambda_dataset.py b/tests/io/test_lambda_dataset.py index 
a3072af451..eac9709d04 100644 --- a/tests/io/test_lambda_dataset.py +++ b/tests/io/test_lambda_dataset.py @@ -104,7 +104,7 @@ def internal_load(): def test_load_undefined(self): """Check the error if `LambdaDataset.__load` is None""" - with pytest.raises(DatasetError, match="Cannot load data set"): + with pytest.raises(DatasetError, match="Cannot load dataset"): LambdaDataset(None, None).load() def test_load_not_callable(self): @@ -128,7 +128,7 @@ def test_save_raises_error(self, mocked_save, mocked_dataset): mocked_save.side_effect = FileExistsError(error_message) pattern = ( - r"Failed while saving data to data set LambdaDataset\(.+\)\.\n" + r"Failed while saving data to dataset LambdaDataset\(.+\)\.\n" + error_message ) with pytest.raises(DatasetError, match=pattern): @@ -137,7 +137,7 @@ def test_save_raises_error(self, mocked_save, mocked_dataset): def test_save_undefined(self): """Check the error if `LambdaDataset.__save` is None""" - with pytest.raises(DatasetError, match="Cannot save to data set"): + with pytest.raises(DatasetError, match="Cannot save to dataset"): LambdaDataset(None, None).save(42) def test_save_none(self, mocked_save, mocked_dataset): diff --git a/tests/pipeline/test_pipeline_from_missing.py b/tests/pipeline/test_pipeline_from_missing.py index f399e70c06..4e40638d83 100644 --- a/tests/pipeline/test_pipeline_from_missing.py +++ b/tests/pipeline/test_pipeline_from_missing.py @@ -210,7 +210,7 @@ def test_partial_propagation(self, branched_pipeline, hook_manager): assert _pipeline_contains(new_pipeline, ["split", "right_out"]) def test_partial_non_existent_propagation(self, branched_pipeline, hook_manager): - """A non existent data set whose node has one unregistered input + """A non existent dataset whose node has one unregistered input and one existent input should be recalculated correctly. """ catalog = _make_catalog(existent=["A", "C", "E", "F"], non_existent=["D"]) From e071640ae1c9fbbb5419291412ecc2a55b65de0f Mon Sep 17 00:00:00 2001 From: Dmitry Sorokin <40151847+DimedS@users.noreply.github.com> Date: Thu, 10 Oct 2024 19:33:42 +0100 Subject: [PATCH 04/19] Manually created sitemap.xml for improved control over indexed docs pages (#4145) * Load manually created sitemap Signed-off-by: Dmitry Sorokin <129520297+DmitrySorokinQB@users.noreply.github.com> * Add projects remove lastmod for latest Signed-off-by: Dmitry Sorokin <129520297+DmitrySorokinQB@users.noreply.github.com> * Add latest for projects Signed-off-by: Dmitry Sorokin <129520297+DmitrySorokinQB@users.noreply.github.com> --------- Signed-off-by: Dmitry Sorokin <129520297+DmitrySorokinQB@users.noreply.github.com> Co-authored-by: Dmitry Sorokin <129520297+DmitrySorokinQB@users.noreply.github.com> Co-authored-by: ElenaKhaustova <157851531+ElenaKhaustova@users.noreply.github.com> Co-authored-by: L. R. 
Couto <57910428+lrcouto@users.noreply.github.com> Signed-off-by: Ankita Katiyar --- docs/source/sitemap.xml | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 docs/source/sitemap.xml diff --git a/docs/source/sitemap.xml b/docs/source/sitemap.xml new file mode 100644 index 0000000000..059f1ac1c6 --- /dev/null +++ b/docs/source/sitemap.xml @@ -0,0 +1,35 @@ +<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> + <url> + <loc>https://docs.kedro.org/en/stable/</loc> + <lastmod>2024-09-01T18:53:11.571849+00:00</lastmod> + <changefreq>monthly</changefreq> + <priority>1.0</priority> + </url> + <url> + <loc>https://docs.kedro.org/en/latest/</loc> + <changefreq>daily</changefreq> + <priority>0.5</priority> + </url> + <url> + <loc>https://docs.kedro.org/projects/kedro-viz/en/stable/</loc> + <lastmod>2024-09-01T18:53:11.571849+00:00</lastmod> + <changefreq>monthly</changefreq> + <priority>1.0</priority> + </url> + <url> + <loc>https://docs.kedro.org/projects/kedro-viz/en/latest/</loc> + <changefreq>daily</changefreq> + <priority>0.5</priority> + </url> + <url> + <loc>https://docs.kedro.org/projects/kedro-datasets/en/stable/</loc> + <lastmod>2024-09-01T18:53:11.571849+00:00</lastmod> + <changefreq>monthly</changefreq> + <priority>1.0</priority> + </url> + <url> + <loc>https://docs.kedro.org/projects/kedro-datasets/en/latest/</loc> + <changefreq>daily</changefreq> + <priority>0.5</priority> + </url> +</urlset> From 8cb24c40ee0d0a8ea24d54a497219ff7603cfd98 Mon Sep 17 00:00:00 2001 From: "L. R. Couto" <57910428+lrcouto@users.noreply.github.com> Date: Thu, 10 Oct 2024 15:57:08 -0300 Subject: [PATCH 05/19] Bump up version to 0.19.9 (#4219) * Bump up version to 0.19.9 Signed-off-by: Laura Couto * Add placeholders to release.md Signed-off-by: Laura Couto * Update citation.cff release date Signed-off-by: Laura Couto --------- Signed-off-by: Laura Couto Signed-off-by: L. R. Couto <57910428+lrcouto@users.noreply.github.com> Signed-off-by: Ankita Katiyar --- CITATION.cff | 4 ++-- RELEASE.md | 9 +++++++++ kedro/__init__.py | 2 +- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/CITATION.cff b/CITATION.cff index 3f57feb252..371e42a1b0 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -44,6 +44,6 @@ authors: - family-names: Brugman given-names: Simon title: Kedro -version: 0.19.8 -date-released: 2024-08-20 +version: 0.19.9 +date-released: 2024-10-10 url: https://github.com/kedro-org/kedro diff --git a/RELEASE.md b/RELEASE.md index 5447340938..59cace8a36 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,5 +1,14 @@ # Upcoming Release +## Major features and improvements +## Bug fixes and other changes +## Breaking changes to the API +## Documentation changes +## Community contributions + + +# Release 0.19.9 + ## Major features and improvements * Dropped Python 3.8 support.
* Implemented `KedroDataCatalog` repeating `DataCatalog` functionality with a few API enhancements: diff --git a/kedro/__init__.py b/kedro/__init__.py index b49d498fc9..00ebebc5a7 100644 --- a/kedro/__init__.py +++ b/kedro/__init__.py @@ -6,7 +6,7 @@ import sys import warnings -__version__ = "0.19.8" +__version__ = "0.19.9" class KedroDeprecationWarning(DeprecationWarning): From 3b2878e0bf04a25e362114b4940a23481cbd27d7 Mon Sep 17 00:00:00 2001 From: Ankita Katiyar Date: Fri, 11 Oct 2024 13:38:09 +0100 Subject: [PATCH 06/19] first pass doesn't work yet Signed-off-by: Ankita Katiyar --- .../91468e64-virtualenv-py3.11.json | 1 + .asv/results/M-WFLM6NH6G5/machine.json | 9 +++ .asv/results/benchmarks.json | 78 +++++++++++++++++++ benchmarks/OmegaConfigLoader/__init__.py | 0 benchmarks/OmegaConfigLoader/benchmark_ocl.py | 60 ++++++++++++++ .../OmegaConfigLoader/conf/base/catalog.yml | 0 .../OmegaConfigLoader/conf/base/globals.yml | 0 .../conf/base/parameters.yml | 0 .../OmegaConfigLoader/conf/local/catalog.yml | 0 .../OmegaConfigLoader/conf/local/globals.yml | 0 .../conf/local/parameters.yml | 0 11 files changed, 148 insertions(+) create mode 100644 .asv/results/M-WFLM6NH6G5/91468e64-virtualenv-py3.11.json create mode 100644 .asv/results/M-WFLM6NH6G5/machine.json create mode 100644 .asv/results/benchmarks.json create mode 100644 benchmarks/OmegaConfigLoader/__init__.py create mode 100644 benchmarks/OmegaConfigLoader/benchmark_ocl.py create mode 100644 benchmarks/OmegaConfigLoader/conf/base/catalog.yml create mode 100644 benchmarks/OmegaConfigLoader/conf/base/globals.yml create mode 100644 benchmarks/OmegaConfigLoader/conf/base/parameters.yml create mode 100644 benchmarks/OmegaConfigLoader/conf/local/catalog.yml create mode 100644 benchmarks/OmegaConfigLoader/conf/local/globals.yml create mode 100644 benchmarks/OmegaConfigLoader/conf/local/parameters.yml diff --git a/.asv/results/M-WFLM6NH6G5/91468e64-virtualenv-py3.11.json b/.asv/results/M-WFLM6NH6G5/91468e64-virtualenv-py3.11.json new file mode 100644 index 0000000000..f25d79f0bb --- /dev/null +++ b/.asv/results/M-WFLM6NH6G5/91468e64-virtualenv-py3.11.json @@ -0,0 +1 @@ +{"commit_hash": "91468e64ea6f1fc4d51fe6313d738476189dd74a", "env_name": "virtualenv-py3.11", "date": 1728586628000, "params": {"arch": "arm64", "cpu": "Apple M1 Max", "machine": "M-WFLM6NH6G5", "num_cpu": "10", "os": "Darwin 23.6.0", "ram": "34359738368", "python": "3.11"}, "python": "3.11", "requirements": {}, "env_vars": {}, "result_columns": ["result", "params", "version", "started_at", "duration", "stats_ci_99_a", "stats_ci_99_b", "stats_q_25", "stats_q_75", "stats_number", "stats_repeat", "samples", "profile"], "results": {"benchmark_dummy.TimeSuite.time_keys": [[2.541248086432182e-06], [], "86e015a3c40c52da31e4185fff7c7176c38c5e1e1e4aba71912db0b388225191", 1728644924899, 0.63448, [2.5262e-06], [2.6412e-06], [2.533e-06], [2.5861e-06], [4180], [10]], "OmegaConfigLoader.benchmark_ocl.TimeOmegaConfigLoader.time_loading_base_config": [[4.0286439130507145e-05], [], "d40454765b26efac921c78a6fef4a045a8e533266f53becd29e5b3d960de881a", 1728644924470, 0.90424, [3.9613e-05], [4.1723e-05], [3.9864e-05], [4.1201e-05], [271], [10]], "OmegaConfigLoader.benchmark_ocl.TimeOmegaConfigLoader.time_loading_env_config": [null, [], "a0fd9c77896289880cc8bbde02dc4ef9b7eb6bcba23d6c91a23ccb202d391b9f", 1728644923139, 0.3623], "OmegaConfigLoader.benchmark_ocl.TimeOmegaConfigLoader.time_merge_destructive_strategy": [null, [], "bc851f1a9364e2a0de2d111f30652d404e1b61da33d1377799c163c00b2690f8", 
1728644923502, 0.31744], "OmegaConfigLoader.benchmark_ocl.TimeOmegaConfigLoader.time_merge_soft_strategy": [null, [], "08a283d77079d9fcf30a4863dfe0bafbd85d27b7e84b13eca9a9a7267eaf5b16", 1728644923819, 0.33724]}, "durations": {}, "version": 2} \ No newline at end of file diff --git a/.asv/results/M-WFLM6NH6G5/machine.json b/.asv/results/M-WFLM6NH6G5/machine.json new file mode 100644 index 0000000000..3fe4186a75 --- /dev/null +++ b/.asv/results/M-WFLM6NH6G5/machine.json @@ -0,0 +1,9 @@ +{ + "arch": "arm64", + "cpu": "Apple M1 Max", + "machine": "M-WFLM6NH6G5", + "num_cpu": "10", + "os": "Darwin 23.6.0", + "ram": "34359738368", + "version": 1 +} \ No newline at end of file diff --git a/.asv/results/benchmarks.json b/.asv/results/benchmarks.json new file mode 100644 index 0000000000..1153a1f2fd --- /dev/null +++ b/.asv/results/benchmarks.json @@ -0,0 +1,78 @@ +{ + "OmegaConfigLoader.benchmark_ocl.TimeOmegaConfigLoader.time_loading_base_config": { + "code": "class TimeOmegaConfigLoader:\n def time_loading_base_config(self):\n \"\"\"Benchmark the time to load the base configuration\"\"\"\n config = self.loader[\"globals\"]\n\n def setup(self):\n # Setup temporary configuration directory with sample config files\n self.temp_dir = tempfile.TemporaryDirectory()\n self.conf_source = Path(self.temp_dir.name)\n self.env = \"local\"\n \n # Create sample config files in the temp directory\n self._create_config_file(\"base\", \"globals.yml\", {\"global_param\": \"value\"})\n self._create_config_file(\"base\", \"catalog.yml\", {\"dataset\": {\"type\": \"pandas.CSVDataSet\"}})\n self._create_config_file(\"local\", \"catalog.yml\", {\"dataset\": {\"filepath\": \"data.csv\"}})\n \n # Instantiate the OmegaConfigLoader\n self.loader = OmegaConfigLoader(conf_source=self.conf_source.as_posix(), env=self.env)", + "min_run_count": 2, + "name": "OmegaConfigLoader.benchmark_ocl.TimeOmegaConfigLoader.time_loading_base_config", + "number": 0, + "param_names": [], + "params": [], + "repeat": 0, + "rounds": 2, + "sample_time": 0.01, + "type": "time", + "unit": "seconds", + "version": "d40454765b26efac921c78a6fef4a045a8e533266f53becd29e5b3d960de881a", + "warmup_time": -1 + }, + "OmegaConfigLoader.benchmark_ocl.TimeOmegaConfigLoader.time_loading_env_config": { + "code": "class TimeOmegaConfigLoader:\n def time_loading_env_config(self):\n \"\"\"Benchmark the time to load environment-specific configuration\"\"\"\n config = self.loader[\"catalog\"]\n\n def setup(self):\n # Setup temporary configuration directory with sample config files\n self.temp_dir = tempfile.TemporaryDirectory()\n self.conf_source = Path(self.temp_dir.name)\n self.env = \"local\"\n \n # Create sample config files in the temp directory\n self._create_config_file(\"base\", \"globals.yml\", {\"global_param\": \"value\"})\n self._create_config_file(\"base\", \"catalog.yml\", {\"dataset\": {\"type\": \"pandas.CSVDataSet\"}})\n self._create_config_file(\"local\", \"catalog.yml\", {\"dataset\": {\"filepath\": \"data.csv\"}})\n \n # Instantiate the OmegaConfigLoader\n self.loader = OmegaConfigLoader(conf_source=self.conf_source.as_posix(), env=self.env)", + "min_run_count": 2, + "name": "OmegaConfigLoader.benchmark_ocl.TimeOmegaConfigLoader.time_loading_env_config", + "number": 0, + "param_names": [], + "params": [], + "repeat": 0, + "rounds": 2, + "sample_time": 0.01, + "type": "time", + "unit": "seconds", + "version": "a0fd9c77896289880cc8bbde02dc4ef9b7eb6bcba23d6c91a23ccb202d391b9f", + "warmup_time": -1 + }, + 
"OmegaConfigLoader.benchmark_ocl.TimeOmegaConfigLoader.time_merge_destructive_strategy": { + "code": "class TimeOmegaConfigLoader:\n def time_merge_destructive_strategy(self):\n \"\"\"Benchmark the time to load and destructively merge configurations\"\"\"\n self.loader.merge_strategy = {\"catalog\": \"destructive\"}\n config = self.loader[\"catalog\"]\n\n def setup(self):\n # Setup temporary configuration directory with sample config files\n self.temp_dir = tempfile.TemporaryDirectory()\n self.conf_source = Path(self.temp_dir.name)\n self.env = \"local\"\n \n # Create sample config files in the temp directory\n self._create_config_file(\"base\", \"globals.yml\", {\"global_param\": \"value\"})\n self._create_config_file(\"base\", \"catalog.yml\", {\"dataset\": {\"type\": \"pandas.CSVDataSet\"}})\n self._create_config_file(\"local\", \"catalog.yml\", {\"dataset\": {\"filepath\": \"data.csv\"}})\n \n # Instantiate the OmegaConfigLoader\n self.loader = OmegaConfigLoader(conf_source=self.conf_source.as_posix(), env=self.env)", + "min_run_count": 2, + "name": "OmegaConfigLoader.benchmark_ocl.TimeOmegaConfigLoader.time_merge_destructive_strategy", + "number": 0, + "param_names": [], + "params": [], + "repeat": 0, + "rounds": 2, + "sample_time": 0.01, + "type": "time", + "unit": "seconds", + "version": "bc851f1a9364e2a0de2d111f30652d404e1b61da33d1377799c163c00b2690f8", + "warmup_time": -1 + }, + "OmegaConfigLoader.benchmark_ocl.TimeOmegaConfigLoader.time_merge_soft_strategy": { + "code": "class TimeOmegaConfigLoader:\n def time_merge_soft_strategy(self):\n \"\"\"Benchmark the time to load and soft-merge configurations\"\"\"\n self.loader.merge_strategy = {\"catalog\": \"soft\"}\n config = self.loader[\"catalog\"]\n\n def setup(self):\n # Setup temporary configuration directory with sample config files\n self.temp_dir = tempfile.TemporaryDirectory()\n self.conf_source = Path(self.temp_dir.name)\n self.env = \"local\"\n \n # Create sample config files in the temp directory\n self._create_config_file(\"base\", \"globals.yml\", {\"global_param\": \"value\"})\n self._create_config_file(\"base\", \"catalog.yml\", {\"dataset\": {\"type\": \"pandas.CSVDataSet\"}})\n self._create_config_file(\"local\", \"catalog.yml\", {\"dataset\": {\"filepath\": \"data.csv\"}})\n \n # Instantiate the OmegaConfigLoader\n self.loader = OmegaConfigLoader(conf_source=self.conf_source.as_posix(), env=self.env)", + "min_run_count": 2, + "name": "OmegaConfigLoader.benchmark_ocl.TimeOmegaConfigLoader.time_merge_soft_strategy", + "number": 0, + "param_names": [], + "params": [], + "repeat": 0, + "rounds": 2, + "sample_time": 0.01, + "type": "time", + "unit": "seconds", + "version": "08a283d77079d9fcf30a4863dfe0bafbd85d27b7e84b13eca9a9a7267eaf5b16", + "warmup_time": -1 + }, + "benchmark_dummy.TimeSuite.time_keys": { + "code": "class TimeSuite:\n def time_keys(self):\n for key in self.d.keys():\n pass\n\n def setup(self):\n self.d = {}\n for x in range(500):\n self.d[x] = None", + "min_run_count": 2, + "name": "benchmark_dummy.TimeSuite.time_keys", + "number": 0, + "param_names": [], + "params": [], + "repeat": 0, + "rounds": 2, + "sample_time": 0.01, + "type": "time", + "unit": "seconds", + "version": "86e015a3c40c52da31e4185fff7c7176c38c5e1e1e4aba71912db0b388225191", + "warmup_time": -1 + }, + "version": 2 +} \ No newline at end of file diff --git a/benchmarks/OmegaConfigLoader/__init__.py b/benchmarks/OmegaConfigLoader/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git 
a/benchmarks/OmegaConfigLoader/benchmark_ocl.py b/benchmarks/OmegaConfigLoader/benchmark_ocl.py new file mode 100644 index 0000000000..c9d1badeda --- /dev/null +++ b/benchmarks/OmegaConfigLoader/benchmark_ocl.py @@ -0,0 +1,60 @@ +from pathlib import Path +import os +import tempfile +from kedro.config import OmegaConfigLoader + +class TimeOmegaConfigLoader: + + def setup(self): + # Setup temporary configuration directory with sample config files + self.temp_dir = tempfile.TemporaryDirectory() + self.conf_source = Path(self.temp_dir.name) + self.env = "local" + + # Create sample config files in the temp directory + self._create_config_file("base", "globals.yml", {"global_param": "value"}) + self._create_config_file("base", "catalog.yml", {"dataset": {"type": "pandas.CSVDataSet"}}) + self._create_config_file("local", "catalog.yml", {"dataset": {"filepath": "data.csv"}}) + + # Instantiate the OmegaConfigLoader + self.loader = OmegaConfigLoader(conf_source=self.conf_source.as_posix(), env=self.env) + + def teardown(self): + # Cleanup temporary directory + self.temp_dir.cleanup() + + def _create_config_file(self, env, file_name, data): + env_path = self.conf_source / env + env_path.mkdir(parents=True, exist_ok=True) + file_path = env_path / file_name + + import yaml + with open(file_path, "w") as f: + yaml.dump(data, f) + + def time_loading_base_config(self): + """Benchmark the time to load the base configuration""" + config = self.loader["globals"] + + def time_loading_env_config(self): + """Benchmark the time to load environment-specific configuration""" + config = self.loader["catalog"] + + def time_merge_soft_strategy(self): + """Benchmark the time to load and soft-merge configurations""" + self.loader.merge_strategy = {"catalog": "soft"} + config = self.loader["catalog"] + + def time_merge_destructive_strategy(self): + """Benchmark the time to load and destructively merge configurations""" + self.loader.merge_strategy = {"catalog": "destructive"} + config = self.loader["catalog"] + + def peak_memory_loading_config(self): + """Benchmark peak memory usage during config loading""" + config = self.loader["catalog"] + + def peak_memory_soft_merge(self): + """Benchmark peak memory usage during soft merge""" + self.loader.merge_strategy = {"catalog": "soft"} + config = self.loader["catalog"] diff --git a/benchmarks/OmegaConfigLoader/conf/base/catalog.yml b/benchmarks/OmegaConfigLoader/conf/base/catalog.yml new file mode 100644 index 0000000000..e69de29bb2 diff --git a/benchmarks/OmegaConfigLoader/conf/base/globals.yml b/benchmarks/OmegaConfigLoader/conf/base/globals.yml new file mode 100644 index 0000000000..e69de29bb2 diff --git a/benchmarks/OmegaConfigLoader/conf/base/parameters.yml b/benchmarks/OmegaConfigLoader/conf/base/parameters.yml new file mode 100644 index 0000000000..e69de29bb2 diff --git a/benchmarks/OmegaConfigLoader/conf/local/catalog.yml b/benchmarks/OmegaConfigLoader/conf/local/catalog.yml new file mode 100644 index 0000000000..e69de29bb2 diff --git a/benchmarks/OmegaConfigLoader/conf/local/globals.yml b/benchmarks/OmegaConfigLoader/conf/local/globals.yml new file mode 100644 index 0000000000..e69de29bb2 diff --git a/benchmarks/OmegaConfigLoader/conf/local/parameters.yml b/benchmarks/OmegaConfigLoader/conf/local/parameters.yml new file mode 100644 index 0000000000..e69de29bb2 From f2f177089b0354b07d9cbaaaa4171dae8f85e522 Mon Sep 17 00:00:00 2001 From: Ankita Katiyar Date: Mon, 14 Oct 2024 11:47:45 +0100 Subject: [PATCH 07/19] Update ocl tests Signed-off-by: Ankita Katiyar --- 
.../91468e64-virtualenv-py3.11.json | 1 - .asv/results/M-WFLM6NH6G5/machine.json | 9 - .asv/results/benchmarks.json | 78 --------- benchmarks/OmegaConfigLoader/__init__.py | 0 benchmarks/OmegaConfigLoader/benchmark_ocl.py | 60 ------- .../OmegaConfigLoader/conf/base/catalog.yml | 0 .../OmegaConfigLoader/conf/base/globals.yml | 0 .../conf/base/parameters.yml | 0 .../OmegaConfigLoader/conf/local/catalog.yml | 0 .../OmegaConfigLoader/conf/local/globals.yml | 0 .../conf/local/parameters.yml | 0 benchmarks/benchmark_ocl.py | 161 ++++++++++++++++++ pyproject.toml | 6 +- 13 files changed, 164 insertions(+), 151 deletions(-) delete mode 100644 .asv/results/M-WFLM6NH6G5/91468e64-virtualenv-py3.11.json delete mode 100644 .asv/results/M-WFLM6NH6G5/machine.json delete mode 100644 .asv/results/benchmarks.json delete mode 100644 benchmarks/OmegaConfigLoader/__init__.py delete mode 100644 benchmarks/OmegaConfigLoader/benchmark_ocl.py delete mode 100644 benchmarks/OmegaConfigLoader/conf/base/catalog.yml delete mode 100644 benchmarks/OmegaConfigLoader/conf/base/globals.yml delete mode 100644 benchmarks/OmegaConfigLoader/conf/base/parameters.yml delete mode 100644 benchmarks/OmegaConfigLoader/conf/local/catalog.yml delete mode 100644 benchmarks/OmegaConfigLoader/conf/local/globals.yml delete mode 100644 benchmarks/OmegaConfigLoader/conf/local/parameters.yml create mode 100644 benchmarks/benchmark_ocl.py diff --git a/.asv/results/M-WFLM6NH6G5/91468e64-virtualenv-py3.11.json b/.asv/results/M-WFLM6NH6G5/91468e64-virtualenv-py3.11.json deleted file mode 100644 index f25d79f0bb..0000000000 --- a/.asv/results/M-WFLM6NH6G5/91468e64-virtualenv-py3.11.json +++ /dev/null @@ -1 +0,0 @@ -{"commit_hash": "91468e64ea6f1fc4d51fe6313d738476189dd74a", "env_name": "virtualenv-py3.11", "date": 1728586628000, "params": {"arch": "arm64", "cpu": "Apple M1 Max", "machine": "M-WFLM6NH6G5", "num_cpu": "10", "os": "Darwin 23.6.0", "ram": "34359738368", "python": "3.11"}, "python": "3.11", "requirements": {}, "env_vars": {}, "result_columns": ["result", "params", "version", "started_at", "duration", "stats_ci_99_a", "stats_ci_99_b", "stats_q_25", "stats_q_75", "stats_number", "stats_repeat", "samples", "profile"], "results": {"benchmark_dummy.TimeSuite.time_keys": [[2.541248086432182e-06], [], "86e015a3c40c52da31e4185fff7c7176c38c5e1e1e4aba71912db0b388225191", 1728644924899, 0.63448, [2.5262e-06], [2.6412e-06], [2.533e-06], [2.5861e-06], [4180], [10]], "OmegaConfigLoader.benchmark_ocl.TimeOmegaConfigLoader.time_loading_base_config": [[4.0286439130507145e-05], [], "d40454765b26efac921c78a6fef4a045a8e533266f53becd29e5b3d960de881a", 1728644924470, 0.90424, [3.9613e-05], [4.1723e-05], [3.9864e-05], [4.1201e-05], [271], [10]], "OmegaConfigLoader.benchmark_ocl.TimeOmegaConfigLoader.time_loading_env_config": [null, [], "a0fd9c77896289880cc8bbde02dc4ef9b7eb6bcba23d6c91a23ccb202d391b9f", 1728644923139, 0.3623], "OmegaConfigLoader.benchmark_ocl.TimeOmegaConfigLoader.time_merge_destructive_strategy": [null, [], "bc851f1a9364e2a0de2d111f30652d404e1b61da33d1377799c163c00b2690f8", 1728644923502, 0.31744], "OmegaConfigLoader.benchmark_ocl.TimeOmegaConfigLoader.time_merge_soft_strategy": [null, [], "08a283d77079d9fcf30a4863dfe0bafbd85d27b7e84b13eca9a9a7267eaf5b16", 1728644923819, 0.33724]}, "durations": {}, "version": 2} \ No newline at end of file diff --git a/.asv/results/M-WFLM6NH6G5/machine.json b/.asv/results/M-WFLM6NH6G5/machine.json deleted file mode 100644 index 3fe4186a75..0000000000 --- a/.asv/results/M-WFLM6NH6G5/machine.json +++ 
/dev/null @@ -1,9 +0,0 @@ -{ - "arch": "arm64", - "cpu": "Apple M1 Max", - "machine": "M-WFLM6NH6G5", - "num_cpu": "10", - "os": "Darwin 23.6.0", - "ram": "34359738368", - "version": 1 -} \ No newline at end of file diff --git a/.asv/results/benchmarks.json b/.asv/results/benchmarks.json deleted file mode 100644 index 1153a1f2fd..0000000000 --- a/.asv/results/benchmarks.json +++ /dev/null @@ -1,78 +0,0 @@ -{ - "OmegaConfigLoader.benchmark_ocl.TimeOmegaConfigLoader.time_loading_base_config": { - "code": "class TimeOmegaConfigLoader:\n def time_loading_base_config(self):\n \"\"\"Benchmark the time to load the base configuration\"\"\"\n config = self.loader[\"globals\"]\n\n def setup(self):\n # Setup temporary configuration directory with sample config files\n self.temp_dir = tempfile.TemporaryDirectory()\n self.conf_source = Path(self.temp_dir.name)\n self.env = \"local\"\n \n # Create sample config files in the temp directory\n self._create_config_file(\"base\", \"globals.yml\", {\"global_param\": \"value\"})\n self._create_config_file(\"base\", \"catalog.yml\", {\"dataset\": {\"type\": \"pandas.CSVDataSet\"}})\n self._create_config_file(\"local\", \"catalog.yml\", {\"dataset\": {\"filepath\": \"data.csv\"}})\n \n # Instantiate the OmegaConfigLoader\n self.loader = OmegaConfigLoader(conf_source=self.conf_source.as_posix(), env=self.env)", - "min_run_count": 2, - "name": "OmegaConfigLoader.benchmark_ocl.TimeOmegaConfigLoader.time_loading_base_config", - "number": 0, - "param_names": [], - "params": [], - "repeat": 0, - "rounds": 2, - "sample_time": 0.01, - "type": "time", - "unit": "seconds", - "version": "d40454765b26efac921c78a6fef4a045a8e533266f53becd29e5b3d960de881a", - "warmup_time": -1 - }, - "OmegaConfigLoader.benchmark_ocl.TimeOmegaConfigLoader.time_loading_env_config": { - "code": "class TimeOmegaConfigLoader:\n def time_loading_env_config(self):\n \"\"\"Benchmark the time to load environment-specific configuration\"\"\"\n config = self.loader[\"catalog\"]\n\n def setup(self):\n # Setup temporary configuration directory with sample config files\n self.temp_dir = tempfile.TemporaryDirectory()\n self.conf_source = Path(self.temp_dir.name)\n self.env = \"local\"\n \n # Create sample config files in the temp directory\n self._create_config_file(\"base\", \"globals.yml\", {\"global_param\": \"value\"})\n self._create_config_file(\"base\", \"catalog.yml\", {\"dataset\": {\"type\": \"pandas.CSVDataSet\"}})\n self._create_config_file(\"local\", \"catalog.yml\", {\"dataset\": {\"filepath\": \"data.csv\"}})\n \n # Instantiate the OmegaConfigLoader\n self.loader = OmegaConfigLoader(conf_source=self.conf_source.as_posix(), env=self.env)", - "min_run_count": 2, - "name": "OmegaConfigLoader.benchmark_ocl.TimeOmegaConfigLoader.time_loading_env_config", - "number": 0, - "param_names": [], - "params": [], - "repeat": 0, - "rounds": 2, - "sample_time": 0.01, - "type": "time", - "unit": "seconds", - "version": "a0fd9c77896289880cc8bbde02dc4ef9b7eb6bcba23d6c91a23ccb202d391b9f", - "warmup_time": -1 - }, - "OmegaConfigLoader.benchmark_ocl.TimeOmegaConfigLoader.time_merge_destructive_strategy": { - "code": "class TimeOmegaConfigLoader:\n def time_merge_destructive_strategy(self):\n \"\"\"Benchmark the time to load and destructively merge configurations\"\"\"\n self.loader.merge_strategy = {\"catalog\": \"destructive\"}\n config = self.loader[\"catalog\"]\n\n def setup(self):\n # Setup temporary configuration directory with sample config files\n self.temp_dir = tempfile.TemporaryDirectory()\n 
self.conf_source = Path(self.temp_dir.name)\n self.env = \"local\"\n \n # Create sample config files in the temp directory\n self._create_config_file(\"base\", \"globals.yml\", {\"global_param\": \"value\"})\n self._create_config_file(\"base\", \"catalog.yml\", {\"dataset\": {\"type\": \"pandas.CSVDataSet\"}})\n self._create_config_file(\"local\", \"catalog.yml\", {\"dataset\": {\"filepath\": \"data.csv\"}})\n \n # Instantiate the OmegaConfigLoader\n self.loader = OmegaConfigLoader(conf_source=self.conf_source.as_posix(), env=self.env)", - "min_run_count": 2, - "name": "OmegaConfigLoader.benchmark_ocl.TimeOmegaConfigLoader.time_merge_destructive_strategy", - "number": 0, - "param_names": [], - "params": [], - "repeat": 0, - "rounds": 2, - "sample_time": 0.01, - "type": "time", - "unit": "seconds", - "version": "bc851f1a9364e2a0de2d111f30652d404e1b61da33d1377799c163c00b2690f8", - "warmup_time": -1 - }, - "OmegaConfigLoader.benchmark_ocl.TimeOmegaConfigLoader.time_merge_soft_strategy": { - "code": "class TimeOmegaConfigLoader:\n def time_merge_soft_strategy(self):\n \"\"\"Benchmark the time to load and soft-merge configurations\"\"\"\n self.loader.merge_strategy = {\"catalog\": \"soft\"}\n config = self.loader[\"catalog\"]\n\n def setup(self):\n # Setup temporary configuration directory with sample config files\n self.temp_dir = tempfile.TemporaryDirectory()\n self.conf_source = Path(self.temp_dir.name)\n self.env = \"local\"\n \n # Create sample config files in the temp directory\n self._create_config_file(\"base\", \"globals.yml\", {\"global_param\": \"value\"})\n self._create_config_file(\"base\", \"catalog.yml\", {\"dataset\": {\"type\": \"pandas.CSVDataSet\"}})\n self._create_config_file(\"local\", \"catalog.yml\", {\"dataset\": {\"filepath\": \"data.csv\"}})\n \n # Instantiate the OmegaConfigLoader\n self.loader = OmegaConfigLoader(conf_source=self.conf_source.as_posix(), env=self.env)", - "min_run_count": 2, - "name": "OmegaConfigLoader.benchmark_ocl.TimeOmegaConfigLoader.time_merge_soft_strategy", - "number": 0, - "param_names": [], - "params": [], - "repeat": 0, - "rounds": 2, - "sample_time": 0.01, - "type": "time", - "unit": "seconds", - "version": "08a283d77079d9fcf30a4863dfe0bafbd85d27b7e84b13eca9a9a7267eaf5b16", - "warmup_time": -1 - }, - "benchmark_dummy.TimeSuite.time_keys": { - "code": "class TimeSuite:\n def time_keys(self):\n for key in self.d.keys():\n pass\n\n def setup(self):\n self.d = {}\n for x in range(500):\n self.d[x] = None", - "min_run_count": 2, - "name": "benchmark_dummy.TimeSuite.time_keys", - "number": 0, - "param_names": [], - "params": [], - "repeat": 0, - "rounds": 2, - "sample_time": 0.01, - "type": "time", - "unit": "seconds", - "version": "86e015a3c40c52da31e4185fff7c7176c38c5e1e1e4aba71912db0b388225191", - "warmup_time": -1 - }, - "version": 2 -} \ No newline at end of file diff --git a/benchmarks/OmegaConfigLoader/__init__.py b/benchmarks/OmegaConfigLoader/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/benchmarks/OmegaConfigLoader/benchmark_ocl.py b/benchmarks/OmegaConfigLoader/benchmark_ocl.py deleted file mode 100644 index c9d1badeda..0000000000 --- a/benchmarks/OmegaConfigLoader/benchmark_ocl.py +++ /dev/null @@ -1,60 +0,0 @@ -from pathlib import Path -import os -import tempfile -from kedro.config import OmegaConfigLoader - -class TimeOmegaConfigLoader: - - def setup(self): - # Setup temporary configuration directory with sample config files - self.temp_dir = tempfile.TemporaryDirectory() - self.conf_source = 
Path(self.temp_dir.name) - self.env = "local" - - # Create sample config files in the temp directory - self._create_config_file("base", "globals.yml", {"global_param": "value"}) - self._create_config_file("base", "catalog.yml", {"dataset": {"type": "pandas.CSVDataSet"}}) - self._create_config_file("local", "catalog.yml", {"dataset": {"filepath": "data.csv"}}) - - # Instantiate the OmegaConfigLoader - self.loader = OmegaConfigLoader(conf_source=self.conf_source.as_posix(), env=self.env) - - def teardown(self): - # Cleanup temporary directory - self.temp_dir.cleanup() - - def _create_config_file(self, env, file_name, data): - env_path = self.conf_source / env - env_path.mkdir(parents=True, exist_ok=True) - file_path = env_path / file_name - - import yaml - with open(file_path, "w") as f: - yaml.dump(data, f) - - def time_loading_base_config(self): - """Benchmark the time to load the base configuration""" - config = self.loader["globals"] - - def time_loading_env_config(self): - """Benchmark the time to load environment-specific configuration""" - config = self.loader["catalog"] - - def time_merge_soft_strategy(self): - """Benchmark the time to load and soft-merge configurations""" - self.loader.merge_strategy = {"catalog": "soft"} - config = self.loader["catalog"] - - def time_merge_destructive_strategy(self): - """Benchmark the time to load and destructively merge configurations""" - self.loader.merge_strategy = {"catalog": "destructive"} - config = self.loader["catalog"] - - def peak_memory_loading_config(self): - """Benchmark peak memory usage during config loading""" - config = self.loader["catalog"] - - def peak_memory_soft_merge(self): - """Benchmark peak memory usage during soft merge""" - self.loader.merge_strategy = {"catalog": "soft"} - config = self.loader["catalog"] diff --git a/benchmarks/OmegaConfigLoader/conf/base/catalog.yml b/benchmarks/OmegaConfigLoader/conf/base/catalog.yml deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/benchmarks/OmegaConfigLoader/conf/base/globals.yml b/benchmarks/OmegaConfigLoader/conf/base/globals.yml deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/benchmarks/OmegaConfigLoader/conf/base/parameters.yml b/benchmarks/OmegaConfigLoader/conf/base/parameters.yml deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/benchmarks/OmegaConfigLoader/conf/local/catalog.yml b/benchmarks/OmegaConfigLoader/conf/local/catalog.yml deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/benchmarks/OmegaConfigLoader/conf/local/globals.yml b/benchmarks/OmegaConfigLoader/conf/local/globals.yml deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/benchmarks/OmegaConfigLoader/conf/local/parameters.yml b/benchmarks/OmegaConfigLoader/conf/local/parameters.yml deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/benchmarks/benchmark_ocl.py b/benchmarks/benchmark_ocl.py new file mode 100644 index 0000000000..a4b7dda5ed --- /dev/null +++ b/benchmarks/benchmark_ocl.py @@ -0,0 +1,161 @@ +import tempfile +from pathlib import Path + +from kedro.config import OmegaConfigLoader + +base_catalog = { + "dataset_1": { + "type": "pandas.CSVDataset", + "filepath": "data1.csv" + }, + "dataset_2": { + "type": "pandas.CSVDataset", + "filepath": "data2.csv" + }, + "dataset_3": { + "type": "pandas.CSVDataset", + "filepath": "data3.csv" + }, + "dataset_4": { + "type": "pandas.CSVDataset", + "filepath": "data4.csv", + "versioned": True, + }, +} +local_catalog = { + "dataset_4" : { + "filepath": 
"data4_local.csv", + "type": "pandas.CSVDataset", + }, + "dataset_5" : { + "filepath": "data5_local.csv", + "type": "pandas.CSVDataset", + }, +} +base_params = { + "param_1": "value_1", + "param_2": "value_2", + "param_3": "value_3", + "param_4": "value_4", +} +local_params = { + "param_4": "value_4_local", + "param_5": "value_5_local", +} +base_globals = { + "global1": "value1", + "global2": "value2", + "global3": "value3", + "global4": "value4", +} +local_globals = { + "global4": "value4_local", + "global5": "value5_local", +} + +def _create_config_file(self, env, file_name, data): + env_path = self.conf_source / env + env_path.mkdir(parents=True, exist_ok=True) + file_path = env_path / file_name + + import yaml + with open(file_path, "w") as f: + yaml.dump(data, f) + + +class TimeOmegaConfigLoader: + + def setup(self): + # Setup temporary configuration directory with sample config files + self.temp_dir = tempfile.TemporaryDirectory() + self.conf_source = Path(self.temp_dir.name) + + # Create sample config files in the temp directory + _create_config_file("base", "catalog.yml", base_catalog) + _create_config_file("local", "catalog.yml", local_catalog) + _create_config_file("base", "parameters.yml", base_params) + _create_config_file("local", "parameters.yml", local_params) + _create_config_file("base", "globals.yml", base_globals) + _create_config_file("local", "globals.yml", local_globals) + + # Instantiate the OmegaConfigLoader + self.loader = OmegaConfigLoader(conf_source=self.conf_source, base_env='base', default_run_env='local') + + def teardown(self): + # Cleanup temporary directory + self.temp_dir.cleanup() + + def time_loading_catalog(self): + """Benchmark the time to load the catalog""" + self.loader["catalog"] + + def time_loading_parameters(self): + """Benchmark the time to load environment-specific configuration""" + self.loader["parameters"] + + def time_loading_parameters_runtime(self): + """Benchmark the time to load parameters with runtime configuration""" + self.loader.runtime_params = {"param_6": "value_6", "param_7": "value_7"} + self.loader["parameters"] + + def time_loading_globals(self): + """Benchmark the time to load global configuration""" + self.loader["globals"] + + def time_merge_soft_strategy(self): + """Benchmark the time to load and soft-merge configurations""" + self.loader.merge_strategy = {"catalog": "soft"} + self.loader["catalog"] + +base_catalog_resolvers = { + "dataset_4": { + "type": "pandas.CSVDataset", + "filepath": "${_basepath}/data4.csv", + "versioned": True, + }, + "_basepath": "folder", +} +base_params_resolvers = { + "param_2": "${globals:global4}", + "param_3": "${my_custom_resolver:custom_resolver}", +} +def custom_resolver(value): + return f"custom_{value}" + +class TimeOmegaConfigLoaderAdvanced: + + def setup(self): + # Setup temporary configuration directory with sample config files + self.temp_dir = tempfile.TemporaryDirectory() + self.conf_source = Path(self.temp_dir.name) + custom_resolvers = {"my_custom_resolver": custom_resolver} + + base_catalog.update(base_catalog_resolvers) + base_params.update(base_params_resolvers) + + # Create sample config files in the temp directory + _create_config_file("base", "catalog.yml", base_catalog) + _create_config_file("local", "catalog.yml", local_catalog) + _create_config_file("base", "parameters.yml", base_params) + _create_config_file("local", "parameters.yml", local_params) + _create_config_file("base", "globals.yml", base_globals) + _create_config_file("local", "globals.yml", local_globals) 
+ + # Instantiate the OmegaConfigLoader + self.loader = OmegaConfigLoader(conf_source=self.conf_source, base_env='base', default_run_env='local', custom_resolvers=custom_resolvers) + + def teardown(self): + # Cleanup temporary directory + self.temp_dir.cleanup() + + def time_loading_catalog(self): + """Benchmark the time to load the catalog""" + self.loader["catalog"] + + def time_loading_parameters(self): + """Benchmark the time to load environment-specific configuration""" + self.loader["parameters"] + + def time_loading_globals(self): + """Benchmark the time to load global configuration""" + self.loader["globals"] diff --git a/pyproject.toml b/pyproject.toml index 6f8e44f7ff..e9ffa3abbc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -114,9 +114,9 @@ kedro = ["py.typed"] readme = {file = "README.md", content-type = "text/markdown"} version = {attr = "kedro.__version__"} -[tool.ruff.format] -exclude = ["**/templates", "features/steps/test_starter"] -docstring-code-format = true +#[tool.ruff.format] +#exclude = ["**/templates", "features/steps/test_starter"] +#docstring-code-format = true [tool.coverage.report] fail_under = 100 From 7618ac5c865f36be532889373df198564c1ecc26 Mon Sep 17 00:00:00 2001 From: Ankita Katiyar Date: Mon, 14 Oct 2024 11:48:33 +0100 Subject: [PATCH 08/19] revert some changes Signed-off-by: Ankita Katiyar --- pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e9ffa3abbc..6f8e44f7ff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -114,9 +114,9 @@ kedro = ["py.typed"] readme = {file = "README.md", content-type = "text/markdown"} version = {attr = "kedro.__version__"} -#[tool.ruff.format] -#exclude = ["**/templates", "features/steps/test_starter"] -#docstring-code-format = true +[tool.ruff.format] +exclude = ["**/templates", "features/steps/test_starter"] +docstring-code-format = true [tool.coverage.report] fail_under = 100 From 95628a3d54e93db6b9cde03ec6ccdbd6f3b55a9f Mon Sep 17 00:00:00 2001 From: Ankita Katiyar Date: Mon, 14 Oct 2024 15:24:35 +0100 Subject: [PATCH 09/19] Update to use larger config Signed-off-by: Ankita Katiyar --- benchmarks/benchmark_ocl.py | 155 +++++++++++++++--------------------- 1 file changed, 65 insertions(+), 90 deletions(-) diff --git a/benchmarks/benchmark_ocl.py b/benchmarks/benchmark_ocl.py index a4b7dda5ed..f4bc2733b2 100644 --- a/benchmarks/benchmark_ocl.py +++ b/benchmarks/benchmark_ocl.py @@ -3,58 +3,37 @@ from kedro.config import OmegaConfigLoader -base_catalog = { - "dataset_1": { - "type": "pandas.CSVDataset", - "filepath": "data1.csv" - }, - "dataset_2": { - "type": "pandas.CSVDataset", - "filepath": "data2.csv" - }, - "dataset_3": { - "type": "pandas.CSVDataset", - "filepath": "data3.csv" - }, - "dataset_4": { - "type": "pandas.CSVDataset", - "filepath": "data4.csv", - "versioned": True, - }, -} -local_catalog = { - "dataset_4" : { - "filepath": "data4_local.csv", - "type": "pandas.CSVDataset", - }, - "dataset_5" : { - "filepath": "data5_local.csv", - "type": "pandas.CSVDataset", - }, -} -base_params = { - "param_1": "value_1", - "param_2": "value_2", - "param_3": "value_3", - "param_4": "value_4", -} -local_params = { - "param_4": "value_4_local", - "param_5": "value_5_local", -} -base_globals = { - "global1": "value1", - "global2": "value2", - "global3": "value3", - "global4": "value4", -} -local_globals = { - "global4": "value4_local", - "global5": "value5_local", -} - -def _create_config_file(self, env, file_name, data): - env_path = 
self.conf_source / env + +# Helper functions to generate sample configuration data +def generate_catalog(start_range, end_range, is_local=False, is_versioned=False, add_interpolation=False): + catalog = {} + for i in range(start_range, end_range + 1): + catalog[f"dataset_{i}"] = { + "type": "pandas.CSVDataset", + "filepath": f"data{i}{'_local' if is_local else ''}.csv" + } + if is_versioned: + catalog[f"dataset_{i}"]["versioned"] = True + if add_interpolation: + catalog[f"dataset_{i}"]["filepath"] = "${_basepath}" + catalog[f"dataset_{i}"]["filepath"] + return catalog + +def generate_params(start_range, end_range, is_local=False, add_globals=False): + if add_globals: + # Generate params with "${globals:global{i}}" + params = {f"param_{i}": f"${{globals:global_{i}}}" for i in range(start_range, end_range + 1)} + else: + # Generate params with "value_{i}" or "value_{i}_local" + params = {f"param_{i}": f"value_{i}{'_local' if is_local else ''}" for i in range(start_range, end_range + 1)} + + return params + +def generate_globals(start_range, end_range, is_local=False): + globals_dict = {f"global_{i}": f"value{i}{'_local' if is_local else ''}" for i in range(start_range, end_range + 1)} + return globals_dict + +def _create_config_file(conf_source, env, file_name, data): + env_path = conf_source / env env_path.mkdir(parents=True, exist_ok=True) file_path = env_path / file_name @@ -62,21 +41,35 @@ def _create_config_file(self, env, file_name, data): with open(file_path, "w") as f: yaml.dump(data, f) +base_catalog = generate_catalog(1, 1000, is_versioned=True) +local_catalog = generate_catalog(501, 1500, is_local=True) +base_params = generate_params(1, 1000) +local_params = generate_params(501, 1500, is_local=True) +base_globals = generate_globals(1, 1000) +local_globals = generate_globals(501, 1500, is_local=True) -class TimeOmegaConfigLoader: +base_catalog_with_interpolations = generate_catalog(1, 1000, is_versioned=True, add_interpolation=True) +base_catalog_with_interpolations.update({"_basepath": "/path/to/data"}) +local_catalog_with_interpolations = generate_catalog(501, 1500, is_local=True, add_interpolation=True) +local_catalog_with_interpolations.update({"_basepath": "/path/to/data"}) + +base_params_with_globals = generate_params(1, 100, add_globals=True) +# local_params_with_globals = generate_params(501, 1000, is_local=True, add_globals=True) + +class TimeOmegaConfigLoader: def setup(self): # Setup temporary configuration directory with sample config files self.temp_dir = tempfile.TemporaryDirectory() self.conf_source = Path(self.temp_dir.name) # Create sample config files in the temp directory - _create_config_file("base", "catalog.yml", base_catalog) - _create_config_file("local", "catalog.yml", local_catalog) - _create_config_file("base", "parameters.yml", base_params) - _create_config_file("local", "parameters.yml", local_params) - _create_config_file("base", "globals.yml", base_globals) - _create_config_file("local", "globals.yml", local_globals) + _create_config_file(self.conf_source, "base", "catalog.yml", base_catalog) + _create_config_file(self.conf_source, "local", "catalog.yml", local_catalog) + _create_config_file(self.conf_source, "base", "parameters.yml", base_params) + _create_config_file(self.conf_source, "local", "parameters.yml", local_params) + _create_config_file(self.conf_source, "base", "globals.yml", base_globals) + _create_config_file(self.conf_source, "local", "globals.yml", local_globals) # Instantiate the OmegaConfigLoader self.loader = 
OmegaConfigLoader(conf_source=self.conf_source, base_env='base', default_run_env='local') @@ -93,56 +86,36 @@ def time_loading_parameters(self): """Benchmark the time to load environment-specific configuration""" self.loader["parameters"] - def time_loading_parameters_runtime(self): - """Benchmark the time to load parameters with runtime configuration""" - self.loader.runtime_params = {"param_6": "value_6", "param_7": "value_7"} - self.loader["parameters"] - def time_loading_globals(self): """Benchmark the time to load global configuration""" self.loader["globals"] + def time_loading_parameters_runtime(self): + """Benchmark the time to load parameters with runtime configuration""" + self.loader.runtime_params = generate_params(2001, 2002) + self.loader["parameters"] + def time_merge_soft_strategy(self): """Benchmark the time to load and soft-merge configurations""" self.loader.merge_strategy = {"catalog": "soft"} self.loader["catalog"] -base_catalog_resolvers = { - "dataset_4": { - "type": "pandas.CSVDataset", - "filepath": "${_basepath}/data4.csv", - "versioned": True, - }, - "_basepath": "folder", -} -base_params_resolvers = { - "param_2": "${globals:global4}", - "param_3": "${my_custom_resolver:custom_resolver}", -} -def custom_resolver(value): - return f"custom_{value}" class TimeOmegaConfigLoaderAdvanced: - def setup(self): # Setup temporary configuration directory with sample config files self.temp_dir = tempfile.TemporaryDirectory() self.conf_source = Path(self.temp_dir.name) - custom_resolvers = {"my_custom_resolver": custom_resolver} - - base_catalog.update(base_catalog_resolvers) - base_params.update(base_params_resolvers) # Create sample config files in the temp directory - _create_config_file("base", "catalog.yml", base_catalog) - _create_config_file("local", "catalog.yml", local_catalog) - _create_config_file("base", "parameters.yml", base_params) - _create_config_file("local", "parameters.yml", local_params) - _create_config_file("base", "globals.yml", base_globals) - _create_config_file("local", "globals.yml", local_globals) + _create_config_file(self.conf_source, "base", "catalog.yml", base_catalog_with_interpolations) + _create_config_file(self.conf_source, "local", "catalog.yml", local_catalog_with_interpolations) + _create_config_file(self.conf_source, "base", "parameters.yml", base_params_with_globals) + _create_config_file(self.conf_source, "base", "globals.yml", base_globals) + _create_config_file(self.conf_source, "local", "globals.yml", local_globals) # Instantiate the OmegaConfigLoader - self.loader = OmegaConfigLoader(conf_source=self.conf_source, base_env='base', default_run_env='local', custom_resolvers=custom_resolvers) + self.loader = OmegaConfigLoader(conf_source=self.conf_source, base_env='base', default_run_env='local') def teardown(self): # Cleanup temporary directory @@ -159,3 +132,5 @@ def time_loading_parameters(self): def time_loading_globals(self): """Benchmark the time to load global configuration""" self.loader["globals"] + + From b5b9bf523164926b883cb9227a7702f1e19afcaa Mon Sep 17 00:00:00 2001 From: Ankita Katiyar Date: Tue, 15 Oct 2024 11:18:20 +0100 Subject: [PATCH 10/19] Update functions and docstrings Signed-off-by: Ankita Katiyar --- benchmarks/benchmark_ocl.py | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/benchmarks/benchmark_ocl.py b/benchmarks/benchmark_ocl.py index f4bc2733b2..4b866f9be1 100644 --- a/benchmarks/benchmark_ocl.py +++ b/benchmarks/benchmark_ocl.py @@ -5,7 +5,7 @@ # 
Helper functions to generate sample configuration data -def generate_catalog(start_range, end_range, is_local=False, is_versioned=False, add_interpolation=False): +def _generate_catalog(start_range, end_range, is_local=False, is_versioned=False, add_interpolation=False): catalog = {} for i in range(start_range, end_range + 1): catalog[f"dataset_{i}"] = { @@ -18,7 +18,7 @@ def generate_catalog(start_range, end_range, is_local=False, is_versioned=False, catalog[f"dataset_{i}"]["filepath"] = "${_basepath}" + catalog[f"dataset_{i}"]["filepath"] return catalog -def generate_params(start_range, end_range, is_local=False, add_globals=False): +def _generate_params(start_range, end_range, is_local=False, add_globals=False): if add_globals: # Generate params with "${globals:global{i}}" params = {f"param_{i}": f"${{globals:global_{i}}}" for i in range(start_range, end_range + 1)} @@ -28,7 +28,7 @@ def generate_params(start_range, end_range, is_local=False, add_globals=False): return params -def generate_globals(start_range, end_range, is_local=False): +def _generate_globals(start_range, end_range, is_local=False): globals_dict = {f"global_{i}": f"value{i}{'_local' if is_local else ''}" for i in range(start_range, end_range + 1)} return globals_dict @@ -41,20 +41,19 @@ def _create_config_file(conf_source, env, file_name, data): with open(file_path, "w") as f: yaml.dump(data, f) -base_catalog = generate_catalog(1, 1000, is_versioned=True) -local_catalog = generate_catalog(501, 1500, is_local=True) -base_params = generate_params(1, 1000) -local_params = generate_params(501, 1500, is_local=True) -base_globals = generate_globals(1, 1000) -local_globals = generate_globals(501, 1500, is_local=True) +base_catalog = _generate_catalog(1, 1000, is_versioned=True) +local_catalog = _generate_catalog(501, 1500, is_local=True) +base_params = _generate_params(1, 1000) +local_params = _generate_params(501, 1500, is_local=True) +base_globals = _generate_globals(1, 1000) +local_globals = _generate_globals(501, 1500, is_local=True) -base_catalog_with_interpolations = generate_catalog(1, 1000, is_versioned=True, add_interpolation=True) +base_catalog_with_interpolations = _generate_catalog(1, 1000, is_versioned=True, add_interpolation=True) base_catalog_with_interpolations.update({"_basepath": "/path/to/data"}) -local_catalog_with_interpolations = generate_catalog(501, 1500, is_local=True, add_interpolation=True) +local_catalog_with_interpolations = _generate_catalog(501, 1500, is_local=True, add_interpolation=True) local_catalog_with_interpolations.update({"_basepath": "/path/to/data"}) -base_params_with_globals = generate_params(1, 100, add_globals=True) -# local_params_with_globals = generate_params(501, 1000, is_local=True, add_globals=True) +base_params_with_globals = _generate_params(1, 100, add_globals=True) class TimeOmegaConfigLoader: @@ -83,7 +82,7 @@ def time_loading_catalog(self): self.loader["catalog"] def time_loading_parameters(self): - """Benchmark the time to load environment-specific configuration""" + """Benchmark the time to load the parameters""" self.loader["parameters"] def time_loading_globals(self): @@ -92,7 +91,7 @@ def time_loading_globals(self): def time_loading_parameters_runtime(self): """Benchmark the time to load parameters with runtime configuration""" - self.loader.runtime_params = generate_params(2001, 2002) + self.loader.runtime_params = _generate_params(2001, 2002) self.loader["parameters"] def time_merge_soft_strategy(self): @@ -112,7 +111,6 @@ def setup(self): 
_create_config_file(self.conf_source, "local", "catalog.yml", local_catalog_with_interpolations) _create_config_file(self.conf_source, "base", "parameters.yml", base_params_with_globals) _create_config_file(self.conf_source, "base", "globals.yml", base_globals) - _create_config_file(self.conf_source, "local", "globals.yml", local_globals) # Instantiate the OmegaConfigLoader self.loader = OmegaConfigLoader(conf_source=self.conf_source, base_env='base', default_run_env='local') @@ -126,11 +124,7 @@ def time_loading_catalog(self): self.loader["catalog"] def time_loading_parameters(self): - """Benchmark the time to load environment-specific configuration""" + """Benchmark the time to load parameters with global interpolation""" self.loader["parameters"] - def time_loading_globals(self): - """Benchmark the time to load global configuration""" - self.loader["globals"] - From 9737847954892c5cd4822982d1cdcf5844fb2b15 Mon Sep 17 00:00:00 2001 From: Ankita Katiyar Date: Tue, 15 Oct 2024 15:52:24 +0100 Subject: [PATCH 11/19] Add performance tests for DataCatalog Signed-off-by: Ankita Katiyar --- asv.conf.json | 8 ++- benchmarks/benchmark_datacatalog.py | 79 +++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+), 1 deletion(-) create mode 100644 benchmarks/benchmark_datacatalog.py diff --git a/asv.conf.json b/asv.conf.json index 2cfcd3a057..b61a6c58a3 100644 --- a/asv.conf.json +++ b/asv.conf.json @@ -8,5 +8,11 @@ "environment_type": "virtualenv", "show_commit_url": "http://github.com/kedro-org/kedro/commit/", "results_dir": ".asv/results", - "html_dir": ".asv/html" + "html_dir": ".asv/html", + "matrix": { + "req": { + "kedro-datasets": [], + "pandas": [] + } + } } diff --git a/benchmarks/benchmark_datacatalog.py b/benchmarks/benchmark_datacatalog.py new file mode 100644 index 0000000000..d3b12e44ea --- /dev/null +++ b/benchmarks/benchmark_datacatalog.py @@ -0,0 +1,79 @@ +import pandas as pd +from kedro_datasets.pandas import CSVDataset + +from kedro.io import DataCatalog + +base_catalog = { + f"dataset_{i}": { + "type": "pandas.CSVDataset", + "filepath": f"data_{i}.csv", + } for i in range(1, 1001) +} +# Add datasets with the same filepath for loading +base_catalog.update({ + f"dataset_load_{i}": { + "type": "pandas.CSVDataset", + "filepath": "data.csv", + } for i in range(1, 1001) +}) +# Add a factory pattern +base_catalog.update({ + "dataset_factory_{placeholder}": { + "type": "pandas.CSVDataset", + "filepath": "data_{placeholder}.csv", + } +}) + +class TimeDataCatalog: + def setup(self): + self.catalog = DataCatalog.from_config(base_catalog) + self.dataframe = pd.DataFrame({"column": [1, 2, 3]}) + self.dataframe.to_csv("data.csv", index=False) + self.datasets = { + f"dataset_new_{i}": CSVDataset(filepath="data.csv") for i in range(1, 1001) + } + self.feed_dict = { + f"param_{i}": i for i in range(1, 1001) + } + + + def time_save(self): + """Benchmark the time to save datasets""" + for i in range(1,1001): + self.catalog.save(f"dataset_{i}", self.dataframe) + + def time_load(self): + """Benchmark the time to load datasets""" + for i in range(1,1001): + self.catalog.load(f"dataset_load_{i}") + + def time_exists(self): + """Benchmark the time to check if datasets exist""" + for i in range(1,1001): + self.catalog.exists(f"dataset_{i}") + + def time_release(self): + """Benchmark the time to release datasets""" + for i in range(1,1001): + self.catalog.release(f"dataset_{i}") + + def time_add_all(self): + """Benchmark the time to add all datasets""" + self.catalog.add_all(self.datasets) + + def 
time_feed_dict(self): + """Benchmark the time to add a feed dict""" + self.catalog.add_feed_dict(self.feed_dict) + + def time_list(self): + """Benchmark the time to list all datasets""" + self.catalog.list() + + def time_shallow_copy(self): + """Benchmark the time to shallow copy the catalog""" + self.catalog.shallow_copy() + + def time_resolve_factory(self): + """Benchmark the time to resolve a dataset factory""" + for i in range(1,1001): + self.catalog._get_dataset(f"dataset_factory_{i}")
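For context, asv (airspeed velocity) discovers these suites by convention: it instantiates the class, calls `setup()`, times each `time_*` method, and calls `teardown()` if one is defined. Before wiring a new suite into `asv run`, it can help to smoke-test it by hand; the sketch below is hypothetical, and assumes it is run from the `benchmarks/` directory with `kedro`, `kedro-datasets` and `pandas` installed.

```python
# Hypothetical smoke test for the suite above; asv drives the same
# setup() -> time_*() sequence (with timing and repetition) during a run.
import os

from benchmark_datacatalog import TimeDataCatalog

suite = TimeDataCatalog()
suite.setup()          # builds the catalog, writes data.csv, prepares the feed dict
suite.time_list()      # each time_* method should complete without raising
suite.time_exists()
os.remove("data.csv")  # clean up the CSV written by setup()
```

The actual measurements then come from `asv run`, which also builds the environments described by the dependency `matrix` added to `asv.conf.json` above.

From 3d1cad4a44713a2941e50ca16c8511349df7b3ec Mon Sep 17 00:00:00 2001 From: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com> Date: Tue, 15 Oct 2024 11:13:22 +0100 Subject: [PATCH 12/19] Update mypy ignore messages (#4228) Signed-off-by: Ankita Katiyar --- kedro/framework/cli/utils.py | 2 +- kedro/framework/context/context.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kedro/framework/cli/utils.py b/kedro/framework/cli/utils.py index 1b50408cc5..ca2acfab31 100644 --- a/kedro/framework/cli/utils.py +++ b/kedro/framework/cli/utils.py @@ -422,7 +422,7 @@ def find_run_command(package_name: str) -> Callable: # use run command from `kedro.framework.cli.project` from kedro.framework.cli.project import run - return run # type: ignore[no-any-return] + return run # type: ignore[return-value] # fail badly if cli.py exists, but has no `cli` in it if not hasattr(project_cli, "cli"): raise KedroCliError(f"Cannot load commands from {package_name}.cli") diff --git a/kedro/framework/context/context.py b/kedro/framework/context/context.py index 5c14cbae38..0b44056374 100644 --- a/kedro/framework/context/context.py +++ b/kedro/framework/context/context.py @@ -207,7 +207,7 @@ def params(self) -> dict[str, Any]: # Merge nested structures params = OmegaConf.merge(params, self._extra_params) - return OmegaConf.to_container(params) if OmegaConf.is_config(params) else params # type: ignore[no-any-return] + return OmegaConf.to_container(params) if OmegaConf.is_config(params) else params # type: ignore[return-value] def _get_catalog( self, From 9fc67118060075d2d52c4f3596cc5e36f7810dd9 Mon Sep 17 00:00:00 2001 From: Dmitry Sorokin <40151847+DimedS@users.noreply.github.com> Date: Tue, 15 Oct 2024 13:48:45 +0100 Subject: [PATCH 13/19] Revise Kedro project structure docs (#4208) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update project structure docs --------- Signed-off-by: Dmitry Sorokin Signed-off-by: Dmitry Sorokin <40151847+DimedS@users.noreply.github.com> Co-authored-by: Juan Luis Cano Rodríguez Signed-off-by: Ankita Katiyar --- docs/source/get_started/kedro_concepts.md | 67 ++++++++++++++++++----- 1 file changed, 52 insertions(+), 15 deletions(-) diff --git a/docs/source/get_started/kedro_concepts.md b/docs/source/get_started/kedro_concepts.md index ffe602a7e2..44f54ac4d8 100644 --- a/docs/source/get_started/kedro_concepts.md +++ b/docs/source/get_started/kedro_concepts.md @@ -63,20 +63,53 @@ The Kedro Data Catalog is the registry of all data sources that the project can One of the main advantages of working with Kedro projects is that they follow a default template that makes collaboration straightforward. Kedro uses semantic naming to set up a default project with specific folders to store datasets, notebooks, configuration and source code. We advise you to retain the default Kedro project structure to make it easy to share your projects with other Kedro users, although you can adapt the folder structure if you need to.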
-The default Kedro project structure is as follows: +Starting from Kedro 0.19, when you create a new project with `kedro new`, you can customise the structure by selecting which tools to include. Depending on your choices, the resulting structure may vary. Below, we outline the default project structure when all tools are selected and give an example with no tools selected. + +### Default Kedro project structure (all tools selected) + +If you select all tools during project creation, your project structure will look like this: + +``` +project-dir # Parent directory of the template +├── conf # Project configuration files +├── data # Local project data (not committed to version control) +├── docs # Project documentation +├── notebooks # Project-related Jupyter notebooks (can be used for experimental code before moving the code to src) +├── src # Project source code +├── tests # Folder containing unit and integration tests +├── .gitignore # Hidden file that prevents staging of unnecessary files to `git` +├── pyproject.toml # Identifies the project root and contains configuration information +├── README.md # Project README +└── requirements.txt # Project dependencies file +``` + +### Example Kedro project structure (no tools selected) + +If you select no tools, the resulting structure will be simpler: ``` -project-dir # Parent directory of the template -├── .gitignore # Hidden file that prevents staging of unnecessary files to `git` -├── conf # Project configuration files -├── data # Local project data (not committed to version control) -├── docs # Project documentation -├── notebooks # Project-related Jupyter notebooks (can be used for experimental code before moving the code to src) -├── pyproject.toml # Identifies the project root and contains configuration information -├── README.md # Project README -└── src # Project source code +project-dir # Parent directory of the template +├── conf # Project configuration files +├── notebooks # Project-related Jupyter notebooks (can be used for experimental code before moving the code to src) +├── src # Project source code +├── .gitignore # Hidden file that prevents staging of unnecessary files to `git` +├── pyproject.toml # Identifies the project root and contains configuration information +├── README.md # Project README +└── requirements.txt # Project dependencies file ``` + +### Tool selection and resulting structure + +During `kedro new`, you can select which [tools to include in your project](../starters/new_project_tools.md). Each tool adds specific files or folders to the project structure: + +- **Lint (Ruff)**: Modifies the `pyproject.toml` file to include Ruff configuration settings for linting. It sets up `ruff` under `[tool.ruff]`, defines options like line length, selected rules, and ignored rules, and includes `ruff` as an optional `dev` dependency. +- **Test (Pytest)**: Adds a `tests` folder for storing unit and integration tests, helping to maintain code quality and ensuring that changes in the codebase do not introduce bugs. For more information about testing in Kedro, visit the [Automated Testing Guide](../development/automated_testing.md). +- **Log**: Enables custom logging configuration by including a `logging.yml` file inside the `conf` folder. For more information about logging customisation in Kedro, visit the [Logging Customisation Guide](../logging/index.md). +- **Docs (Sphinx)**: Adds a `docs` folder with a Sphinx documentation setup. This folder is typically used to generate technical documentation for the project.
+- **Data Folder**: Adds a `data` folder structure for managing project data, organised into multiple subfolders. We recommend you put raw data into `raw` and move processed data to other subfolders, as outlined [in this data engineering article](https://towardsdatascience.com/the-importance-of-layered-thinking-in-data-engineering-a09f685edc71). +- **PySpark**: Adds PySpark-specific configuration files. +- **Kedro-Viz**: Adds Kedro's native visualisation tool with an [experiment tracking setup](https://docs.kedro.org/projects/kedro-viz/en/stable/experiment_tracking.html). + ### `conf` The `conf` folder contains two subfolders for storing configuration information: `base` and `local`. @@ -88,7 +121,7 @@ Use the `base` subfolder for project-specific settings to share across different The folder contains three files for the example, but you can add others as you require: - `catalog.yml` - [Configures the Data Catalog](../data/data_catalog.md#use-the-data-catalog-within-kedro-configuration) with the file paths and load/save configuration needed for different datasets -- `logging.yml` - Uses Python's default [`logging`](https://docs.python.org/3/library/logging.html) library to set up logging +- `logging.yml` - Uses Python's default [`logging`](https://docs.python.org/3/library/logging.html) library to set up logging (only added if the Log tool is selected). - `parameters.yml` - Allows you to define parameters for machine learning experiments, for example, train/test split and the number of iterations #### `conf/local` @@ -99,10 +132,14 @@ Use the `local` subfolder for **settings that should not be shared**, such as ac By default, Kedro creates one file, `credentials.yml`, in `conf/local`. -### `data` - -The `data` folder contains multiple subfolders to store project data. We recommend you put raw data into `raw` and move processed data to other subfolders according to the [commonly accepted data engineering convention](https://towardsdatascience.com/the-importance-of-layered-thinking-in-data-engineering-a09f685edc71). - ### `src` This subfolder contains the project's source code. + +### Customising your project structure + +While the default Kedro structure is recommended for collaboration and standardisation, it is possible to adapt the folder structure if necessary. This flexibility allows you to tailor the project to your needs while maintaining a consistent and recognisable structure. + +The only technical requirement when organising code is that the `pipeline_registry.py` and `settings.py` files must remain in the `/src/` directory, where they are created by default. + +The `pipeline_registry.py` file must include a `register_pipelines()` function that returns a `dict[str, Pipeline]`, which maps pipeline names to their corresponding `Pipeline` objects, as shown in the sketch below.
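For illustration, a minimal `pipeline_registry.py` might look like the following. This mirrors the default project template's behaviour, where `find_pipelines()` auto-discovers pipelines and the `__default__` entry combines them; treat it as a sketch rather than required content.

```python
# Minimal pipeline_registry.py sketch, in the spirit of the default template.
from kedro.framework.project import find_pipelines
from kedro.pipeline import Pipeline


def register_pipelines() -> dict[str, Pipeline]:
    """Register the project's pipelines.

    Returns:
        A mapping from pipeline names to ``Pipeline`` objects.
    """
    pipelines = find_pipelines()  # discovers create_pipeline() under <package>/pipelines/
    pipelines["__default__"] = sum(pipelines.values())  # pipeline run by a bare `kedro run`
    return pipelines
```

Because `Pipeline` objects support addition, `sum(pipelines.values())` composes every discovered pipeline into the single pipeline that `kedro run` executes by default.

From b6587e0b0dde727fc1542530c133541c7cae879f Mon Sep 17 00:00:00 2001 From: Hyewon Choi <76198373+hyew0nChoi@users.noreply.github.com> Date: Tue, 15 Oct 2024 23:20:42 +0900 Subject: [PATCH 14/19] Update CLI autocompletion docs with new Click syntax (#4213) * Update CLI autocompletion docs with new Click syntax Updated the autocompletion setup instructions for Bash, Zsh, and Fish shells to reflect the latest Click 8.1 syntax. Changed Fish shell completion script path to ~/.config/fish/completions/kedro.fish for correct placement.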
Signed-off-by: hyew0nChoi Signed-off-by: Ankita Katiyar --- RELEASE.md | 4 ++++ docs/source/development/commands_reference.md | 6 +++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/RELEASE.md b/RELEASE.md index 59cace8a36..a5e34a6ba8 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -4,7 +4,10 @@ ## Bug fixes and other changes ## Breaking changes to the API ## Documentation changes +* Updated CLI autocompletion docs with new Click syntax. + ## Community contributions +* [Hyewon Choi](https://github.com/hyew0nChoi) # Release 0.19.9 @@ -38,6 +41,7 @@ * Fix logo on PyPI page. * Minor language/styling updates. + ## Community contributions * [Puneet](https://github.com/puneeter) * [ethanknights](https://github.com/ethanknights) diff --git a/docs/source/development/commands_reference.md b/docs/source/development/commands_reference.md index 12a90371f7..d66d4ffcc6 100644 --- a/docs/source/development/commands_reference.md +++ b/docs/source/development/commands_reference.md @@ -16,7 +16,7 @@ echo $0 Add the following to your ~/.bashrc (or just run it on the command line): ```bash -eval "$(_KEDRO_COMPLETE=source kedro)" +eval "$(_KEDRO_COMPLETE=bash_source kedro)" ``` @@ -26,7 +26,7 @@ eval "$(_KEDRO_COMPLETE=source kedro)" Add the following to ~/.zshrc: ```bash -eval "$(_KEDRO_COMPLETE=source_zsh kedro)" +eval "$(_KEDRO_COMPLETE=zsh_source kedro)" ``` @@ -36,7 +36,7 @@ eval "$(_KEDRO_COMPLETE=source_zsh kedro)" Add the following to ~/.config/fish/completions/foo-bar.fish: ```bash -eval (env _KEDRO_COMPLETE=source_fish kedro) +eval (env _KEDRO_COMPLETE=fish_source kedro) ``` From 062aba3534631a83b1b3fa83c8845c8cfd47fe78 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 15 Oct 2024 15:51:05 +0000 Subject: [PATCH 15/19] Bump import-linter from 2.0 to 2.1 (#4226) Bumps [import-linter](https://github.com/seddonym/import-linter) from 2.0 to 2.1. - [Changelog](https://github.com/seddonym/import-linter/blob/master/CHANGELOG.rst) - [Commits](https://github.com/seddonym/import-linter/compare/v2.0...v2.1) --- updated-dependencies: - dependency-name: import-linter dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Ankita Katiyar --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6f8e44f7ff..97124a5813 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,7 +55,7 @@ dynamic = ["readme", "version"] test = [ "behave==1.2.6", "coverage[toml]", - "import-linter==2.0", + "import-linter==2.1", "ipylab>=1.0.0", "ipython~=8.10", "jupyterlab_server>=2.11.1", From 56aefae77e13dfd93e0b8c1fddb6fde0812fe1cb Mon Sep 17 00:00:00 2001 From: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com> Date: Wed, 16 Oct 2024 15:10:47 +0100 Subject: [PATCH 16/19] Performance test for `OmegaConfigLoader` (#4225) * first pass doesn't work yet Signed-off-by: Ankita Katiyar * Update ocl tests Signed-off-by: Ankita Katiyar * revert some changes Signed-off-by: Ankita Katiyar * Update to use larger config Signed-off-by: Ankita Katiyar * Update functions and docstrings Signed-off-by: Ankita Katiyar * lint Signed-off-by: Ankita Katiyar --------- Signed-off-by: Ankita Katiyar --- benchmarks/benchmark_ocl.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/benchmarks/benchmark_ocl.py b/benchmarks/benchmark_ocl.py index 4b866f9be1..5c38b61901 100644 --- a/benchmarks/benchmark_ocl.py +++ b/benchmarks/benchmark_ocl.py @@ -126,5 +126,3 @@ def time_loading_catalog(self): def time_loading_parameters(self): """Benchmark the time to load parameters with global interpolation""" self.loader["parameters"] - - From f981b9bbe4969af30226dcce9c8e6c43907d227a Mon Sep 17 00:00:00 2001 From: Ankita Katiyar Date: Wed, 16 Oct 2024 17:40:49 +0100 Subject: [PATCH 17/19] Add a test for init and fix indent Signed-off-by: Ankita Katiyar --- ...rtualenv-py3.11-kedro-datasets-pandas.json | 1 + .asv/results/M-WFLM6NH6G5/machine.json | 9 + .asv/results/benchmarks.json | 273 ++++++++++++++++++ benchmarks/benchmark_datacatalog.py | 3 + benchmarks/benchmark_ocl.py | 12 +- 5 files changed, 292 insertions(+), 6 deletions(-) create mode 100644 .asv/results/M-WFLM6NH6G5/f594c8bc-virtualenv-py3.11-kedro-datasets-pandas.json create mode 100644 .asv/results/M-WFLM6NH6G5/machine.json create mode 100644 .asv/results/benchmarks.json diff --git a/.asv/results/M-WFLM6NH6G5/f594c8bc-virtualenv-py3.11-kedro-datasets-pandas.json b/.asv/results/M-WFLM6NH6G5/f594c8bc-virtualenv-py3.11-kedro-datasets-pandas.json new file mode 100644 index 0000000000..b7dff2c438 --- /dev/null +++ b/.asv/results/M-WFLM6NH6G5/f594c8bc-virtualenv-py3.11-kedro-datasets-pandas.json @@ -0,0 +1 @@ +{"commit_hash": "f594c8bcd43100b216ac104d1e670ca4d5783096", "env_name": "virtualenv-py3.11-kedro-datasets-pandas", "date": 1729087847000, "params": {"arch": "arm64", "cpu": "Apple M1 Max", "machine": "M-WFLM6NH6G5", "num_cpu": "10", "os": "Darwin 23.6.0", "ram": "34359738368", "python": "3.11", "kedro-datasets": "", "pandas": ""}, "python": "3.11", "requirements": {"kedro-datasets": "", "pandas": ""}, "env_vars": {}, "result_columns": ["result", "params", "version", "started_at", "duration", "stats_ci_99_a", "stats_ci_99_b", "stats_q_25", "stats_q_75", "stats_number", "stats_repeat", "samples", "profile"], "results": {"benchmark_datacatalog.TimeDataCatalog.time_add_all": [[0.04391904201474972], [], "f470854de9a319e47c6bab999cdb3c7662341b063447a6866500b2e70b2c5ed8", 1729096807842, 1.0691, [-Infinity], [Infinity], [0.043919], [0.043919], [1], [1]], "benchmark_datacatalog.TimeDataCatalog.time_exists": 
[[0.026259125006617978], [], "9bbce85f01a2cfbb5569bc2ba076dd22b662d5d17db4901cd5269d14dbce3ea6", 1729096808911, 0.93221, [-Infinity], [Infinity], [0.026259], [0.026259], [1], [1]], "benchmark_datacatalog.TimeDataCatalog.time_feed_dict": [[0.034821834007743746], [], "0101ab38b04d7b27eda18738a82f5f06e48604c6e91d0e10bae33327bb588f69", 1729096809843, 0.8628, [-Infinity], [Infinity], [0.034822], [0.034822], [1], [1]], "benchmark_datacatalog.TimeDataCatalog.time_initialise": [[0.0948639580165036], [], "9e460ed25ea64f63f905b3f3a01a817e5daa00c81390b0cdfc25fbad0ae85ea6", 1729096810706, 1.0807, [-Infinity], [Infinity], [0.094864], [0.094864], [1], [1]], "benchmark_datacatalog.TimeDataCatalog.time_list": [[1.4999997802078724e-05], [], "4a7ae456f2349941bdbc595b3919284633da1da166cf1394660a0399ec618687", 1729096811787, 0.82309, [-Infinity], [Infinity], [1.5e-05], [1.5e-05], [1], [1]], "benchmark_datacatalog.TimeDataCatalog.time_load": [[0.20476879199850373], [], "96bd6914ea6ed937ae958177afd17861ccf3ab1481a5d4d2ed8002dbc5d1131e", 1729096812610, 0.86483, [-Infinity], [Infinity], [0.20477], [0.20477], [1], [1]], "benchmark_datacatalog.TimeDataCatalog.time_release": [[0.00888583398773335], [], "fa49ed3249b0319f92a7d6309f2a58ed8595c86141b16768cd575326a2d28d77", 1729096813475, 0.76429, [-Infinity], [Infinity], [0.0088858], [0.0088858], [1], [1]], "benchmark_datacatalog.TimeDataCatalog.time_resolve_factory": [[0.09763062497950159], [], "c051d31d513ef455328bb051eafacb1cc06d9f84dd613ee2e0bee3440bbab467", 1729096814240, 0.83693, [-Infinity], [Infinity], [0.097631], [0.097631], [1], [1]], "benchmark_datacatalog.TimeDataCatalog.time_save": [[0.5207992079958785], [], "4ea897eb28bd91fc7cf8da6e2679bf608b909c8db9ebdffd97f3bf19b275a809", 1729096815077, 1.6094, [-Infinity], [Infinity], [0.5208], [0.5208], [1], [1]], "benchmark_datacatalog.TimeDataCatalog.time_shallow_copy": [[0.031129041977692395], [], "64ead39024c492d18b91a21c23a9d3f1323533cd37bd53934d54701ecb259762", 1729096816687, 0.68384, [-Infinity], [Infinity], [0.031129], [0.031129], [1], [1]], "benchmark_dummy.TimeSuite.time_keys": [[3.2910029403865337e-06], [], "86e015a3c40c52da31e4185fff7c7176c38c5e1e1e4aba71912db0b388225191", 1729096817371, 0.18749, [-Infinity], [Infinity], [3.291e-06], [3.291e-06], [1], [1]], "benchmark_ocl.TimeOmegaConfigLoader.time_loading_catalog": [[0.4770808340108488], [], "3ccff2348faeaf3038548994686b45deeaa3c7c46df2270a8a1f697e7401ae5a", 1729096817558, 1.0252, [-Infinity], [Infinity], [0.47708], [0.47708], [1], [1]], "benchmark_ocl.TimeOmegaConfigLoader.time_loading_globals": [[0.11477499999455176], [], "d42dda2b001097642dc790de01ab15e3f1f11426f6bfc5affbc1c658248f32be", 1729096818583, 0.75575, [-Infinity], [Infinity], [0.11477], [0.11477], [1], [1]], "benchmark_ocl.TimeOmegaConfigLoader.time_loading_parameters": [[0.12353595800232142], [], "3187d47ad3445bdf83439512e124e3cde01f0503a3ffa7db9ca7a02e6bc2f7f2", 1729096819339, 0.78861, [-Infinity], [Infinity], [0.12354], [0.12354], [1], [1]], "benchmark_ocl.TimeOmegaConfigLoader.time_loading_parameters_runtime": [[0.13527949998388067], [], "153be6afe75261c83d15bbc165c10b98af15d3489c722c0f7f8e5c0ce3ca2d59", 1729096820128, 0.77955, [-Infinity], [Infinity], [0.13528], [0.13528], [1], [1]], "benchmark_ocl.TimeOmegaConfigLoader.time_merge_soft_strategy": [[0.7280506670067552], [], "317897f43311426ea9b688e3019361eb5bb1f61f60eca4f763d7a8ec38265ea2", 1729096820908, 1.4309, [-Infinity], [Infinity], [0.72805], [0.72805], [1], [1]], "benchmark_ocl.TimeOmegaConfigLoaderAdvanced.time_loading_catalog": 
[[0.6754177919938229], [], "5499c39a6750c5d527f1a3e8a747fdd5b3128af31640d9d7ee9c72be261e344a", 1729096822339, 1.3692, [-Infinity], [Infinity], [0.67542], [0.67542], [1], [1]], "benchmark_ocl.TimeOmegaConfigLoaderAdvanced.time_loading_parameters": [[1.9011982079828158], [], "f74ef4eead7a35df856006dbf9e1b72b61ba36b34767525f55bf8c5eabb343f1", 1729096823708, 2.5663, [-Infinity], [Infinity], [1.9012], [1.9012], [1], [1]]}, "durations": {}, "version": 2} \ No newline at end of file diff --git a/.asv/results/M-WFLM6NH6G5/machine.json b/.asv/results/M-WFLM6NH6G5/machine.json new file mode 100644 index 0000000000..3fe4186a75 --- /dev/null +++ b/.asv/results/M-WFLM6NH6G5/machine.json @@ -0,0 +1,9 @@ +{ + "arch": "arm64", + "cpu": "Apple M1 Max", + "machine": "M-WFLM6NH6G5", + "num_cpu": "10", + "os": "Darwin 23.6.0", + "ram": "34359738368", + "version": 1 +} \ No newline at end of file diff --git a/.asv/results/benchmarks.json b/.asv/results/benchmarks.json new file mode 100644 index 0000000000..347fbe8942 --- /dev/null +++ b/.asv/results/benchmarks.json @@ -0,0 +1,273 @@ +{ + "benchmark_datacatalog.TimeDataCatalog.time_add_all": { + "code": "class TimeDataCatalog:\n def time_add_all(self):\n \"\"\"Benchmark the time to add all datasets\"\"\"\n self.catalog.add_all(self.datasets)\n\n def setup(self):\n self.catalog = DataCatalog.from_config(base_catalog)\n self.dataframe = pd.DataFrame({\"column\": [1, 2, 3]})\n self.dataframe.to_csv(\"data.csv\", index=False)\n self.datasets = {\n f\"dataset_new_{i}\": CSVDataset(filepath=\"data.csv\") for i in range(1, 1001)\n }\n self.feed_dict = {\n f\"param_{i}\": i for i in range(1, 1001)\n }", + "min_run_count": 2, + "name": "benchmark_datacatalog.TimeDataCatalog.time_add_all", + "number": 0, + "param_names": [], + "params": [], + "repeat": 0, + "rounds": 2, + "sample_time": 0.01, + "type": "time", + "unit": "seconds", + "version": "f470854de9a319e47c6bab999cdb3c7662341b063447a6866500b2e70b2c5ed8", + "warmup_time": -1 + }, + "benchmark_datacatalog.TimeDataCatalog.time_exists": { + "code": "class TimeDataCatalog:\n def time_exists(self):\n \"\"\"Benchmark the time to check if datasets exist\"\"\"\n for i in range(1,1001):\n self.catalog.exists(f\"dataset_{i}\")\n\n def setup(self):\n self.catalog = DataCatalog.from_config(base_catalog)\n self.dataframe = pd.DataFrame({\"column\": [1, 2, 3]})\n self.dataframe.to_csv(\"data.csv\", index=False)\n self.datasets = {\n f\"dataset_new_{i}\": CSVDataset(filepath=\"data.csv\") for i in range(1, 1001)\n }\n self.feed_dict = {\n f\"param_{i}\": i for i in range(1, 1001)\n }", + "min_run_count": 2, + "name": "benchmark_datacatalog.TimeDataCatalog.time_exists", + "number": 0, + "param_names": [], + "params": [], + "repeat": 0, + "rounds": 2, + "sample_time": 0.01, + "type": "time", + "unit": "seconds", + "version": "9bbce85f01a2cfbb5569bc2ba076dd22b662d5d17db4901cd5269d14dbce3ea6", + "warmup_time": -1 + }, + "benchmark_datacatalog.TimeDataCatalog.time_feed_dict": { + "code": "class TimeDataCatalog:\n def time_feed_dict(self):\n \"\"\"Benchmark the time to add feed dict\"\"\"\n self.catalog.add_feed_dict(self.feed_dict)\n\n def setup(self):\n self.catalog = DataCatalog.from_config(base_catalog)\n self.dataframe = pd.DataFrame({\"column\": [1, 2, 3]})\n self.dataframe.to_csv(\"data.csv\", index=False)\n self.datasets = {\n f\"dataset_new_{i}\": CSVDataset(filepath=\"data.csv\") for i in range(1, 1001)\n }\n self.feed_dict = {\n f\"param_{i}\": i for i in range(1, 1001)\n }", + "min_run_count": 2, + "name": 
"benchmark_datacatalog.TimeDataCatalog.time_feed_dict", + "number": 0, + "param_names": [], + "params": [], + "repeat": 0, + "rounds": 2, + "sample_time": 0.01, + "type": "time", + "unit": "seconds", + "version": "0101ab38b04d7b27eda18738a82f5f06e48604c6e91d0e10bae33327bb588f69", + "warmup_time": -1 + }, + "benchmark_datacatalog.TimeDataCatalog.time_initialise": { + "code": "class TimeDataCatalog:\n def time_initialise(self):\n \"\"\"Benchmark the time to initialise the catalog\"\"\"\n DataCatalog.from_config(base_catalog)\n\n def setup(self):\n self.catalog = DataCatalog.from_config(base_catalog)\n self.dataframe = pd.DataFrame({\"column\": [1, 2, 3]})\n self.dataframe.to_csv(\"data.csv\", index=False)\n self.datasets = {\n f\"dataset_new_{i}\": CSVDataset(filepath=\"data.csv\") for i in range(1, 1001)\n }\n self.feed_dict = {\n f\"param_{i}\": i for i in range(1, 1001)\n }", + "min_run_count": 2, + "name": "benchmark_datacatalog.TimeDataCatalog.time_initialise", + "number": 0, + "param_names": [], + "params": [], + "repeat": 0, + "rounds": 2, + "sample_time": 0.01, + "type": "time", + "unit": "seconds", + "version": "9e460ed25ea64f63f905b3f3a01a817e5daa00c81390b0cdfc25fbad0ae85ea6", + "warmup_time": -1 + }, + "benchmark_datacatalog.TimeDataCatalog.time_list": { + "code": "class TimeDataCatalog:\n def time_list(self):\n \"\"\"Benchmark the time to list all datasets\"\"\"\n self.catalog.list()\n\n def setup(self):\n self.catalog = DataCatalog.from_config(base_catalog)\n self.dataframe = pd.DataFrame({\"column\": [1, 2, 3]})\n self.dataframe.to_csv(\"data.csv\", index=False)\n self.datasets = {\n f\"dataset_new_{i}\": CSVDataset(filepath=\"data.csv\") for i in range(1, 1001)\n }\n self.feed_dict = {\n f\"param_{i}\": i for i in range(1, 1001)\n }", + "min_run_count": 2, + "name": "benchmark_datacatalog.TimeDataCatalog.time_list", + "number": 0, + "param_names": [], + "params": [], + "repeat": 0, + "rounds": 2, + "sample_time": 0.01, + "type": "time", + "unit": "seconds", + "version": "4a7ae456f2349941bdbc595b3919284633da1da166cf1394660a0399ec618687", + "warmup_time": -1 + }, + "benchmark_datacatalog.TimeDataCatalog.time_load": { + "code": "class TimeDataCatalog:\n def time_load(self):\n \"\"\"Benchmark the time to load datasets\"\"\"\n for i in range(1,1001):\n self.catalog.load(f\"dataset_load_{i}\")\n\n def setup(self):\n self.catalog = DataCatalog.from_config(base_catalog)\n self.dataframe = pd.DataFrame({\"column\": [1, 2, 3]})\n self.dataframe.to_csv(\"data.csv\", index=False)\n self.datasets = {\n f\"dataset_new_{i}\": CSVDataset(filepath=\"data.csv\") for i in range(1, 1001)\n }\n self.feed_dict = {\n f\"param_{i}\": i for i in range(1, 1001)\n }", + "min_run_count": 2, + "name": "benchmark_datacatalog.TimeDataCatalog.time_load", + "number": 0, + "param_names": [], + "params": [], + "repeat": 0, + "rounds": 2, + "sample_time": 0.01, + "type": "time", + "unit": "seconds", + "version": "96bd6914ea6ed937ae958177afd17861ccf3ab1481a5d4d2ed8002dbc5d1131e", + "warmup_time": -1 + }, + "benchmark_datacatalog.TimeDataCatalog.time_release": { + "code": "class TimeDataCatalog:\n def time_release(self):\n \"\"\"Benchmark the time to release datasets\"\"\"\n for i in range(1,1001):\n self.catalog.release(f\"dataset_{i}\")\n\n def setup(self):\n self.catalog = DataCatalog.from_config(base_catalog)\n self.dataframe = pd.DataFrame({\"column\": [1, 2, 3]})\n self.dataframe.to_csv(\"data.csv\", index=False)\n self.datasets = {\n f\"dataset_new_{i}\": CSVDataset(filepath=\"data.csv\") for i in range(1, 
1001)\n }\n self.feed_dict = {\n f\"param_{i}\": i for i in range(1, 1001)\n }", + "min_run_count": 2, + "name": "benchmark_datacatalog.TimeDataCatalog.time_release", + "number": 0, + "param_names": [], + "params": [], + "repeat": 0, + "rounds": 2, + "sample_time": 0.01, + "type": "time", + "unit": "seconds", + "version": "fa49ed3249b0319f92a7d6309f2a58ed8595c86141b16768cd575326a2d28d77", + "warmup_time": -1 + }, + "benchmark_datacatalog.TimeDataCatalog.time_resolve_factory": { + "code": "class TimeDataCatalog:\n def time_resolve_factory(self):\n \"\"\"Benchmark the time to resolve factory\"\"\"\n for i in range(1,1001):\n self.catalog._get_dataset(f\"dataset_factory_{i}\")\n\n def setup(self):\n self.catalog = DataCatalog.from_config(base_catalog)\n self.dataframe = pd.DataFrame({\"column\": [1, 2, 3]})\n self.dataframe.to_csv(\"data.csv\", index=False)\n self.datasets = {\n f\"dataset_new_{i}\": CSVDataset(filepath=\"data.csv\") for i in range(1, 1001)\n }\n self.feed_dict = {\n f\"param_{i}\": i for i in range(1, 1001)\n }", + "min_run_count": 2, + "name": "benchmark_datacatalog.TimeDataCatalog.time_resolve_factory", + "number": 0, + "param_names": [], + "params": [], + "repeat": 0, + "rounds": 2, + "sample_time": 0.01, + "type": "time", + "unit": "seconds", + "version": "c051d31d513ef455328bb051eafacb1cc06d9f84dd613ee2e0bee3440bbab467", + "warmup_time": -1 + }, + "benchmark_datacatalog.TimeDataCatalog.time_save": { + "code": "class TimeDataCatalog:\n def time_save(self):\n \"\"\"Benchmark the time to save datasets\"\"\"\n for i in range(1,1001):\n self.catalog.save(f\"dataset_{i}\", self.dataframe)\n\n def setup(self):\n self.catalog = DataCatalog.from_config(base_catalog)\n self.dataframe = pd.DataFrame({\"column\": [1, 2, 3]})\n self.dataframe.to_csv(\"data.csv\", index=False)\n self.datasets = {\n f\"dataset_new_{i}\": CSVDataset(filepath=\"data.csv\") for i in range(1, 1001)\n }\n self.feed_dict = {\n f\"param_{i}\": i for i in range(1, 1001)\n }", + "min_run_count": 2, + "name": "benchmark_datacatalog.TimeDataCatalog.time_save", + "number": 0, + "param_names": [], + "params": [], + "repeat": 0, + "rounds": 2, + "sample_time": 0.01, + "type": "time", + "unit": "seconds", + "version": "4ea897eb28bd91fc7cf8da6e2679bf608b909c8db9ebdffd97f3bf19b275a809", + "warmup_time": -1 + }, + "benchmark_datacatalog.TimeDataCatalog.time_shallow_copy": { + "code": "class TimeDataCatalog:\n def time_shallow_copy(self):\n \"\"\"Benchmark the time to shallow copy the catalog\"\"\"\n self.catalog.shallow_copy()\n\n def setup(self):\n self.catalog = DataCatalog.from_config(base_catalog)\n self.dataframe = pd.DataFrame({\"column\": [1, 2, 3]})\n self.dataframe.to_csv(\"data.csv\", index=False)\n self.datasets = {\n f\"dataset_new_{i}\": CSVDataset(filepath=\"data.csv\") for i in range(1, 1001)\n }\n self.feed_dict = {\n f\"param_{i}\": i for i in range(1, 1001)\n }", + "min_run_count": 2, + "name": "benchmark_datacatalog.TimeDataCatalog.time_shallow_copy", + "number": 0, + "param_names": [], + "params": [], + "repeat": 0, + "rounds": 2, + "sample_time": 0.01, + "type": "time", + "unit": "seconds", + "version": "64ead39024c492d18b91a21c23a9d3f1323533cd37bd53934d54701ecb259762", + "warmup_time": -1 + }, + "benchmark_dummy.TimeSuite.time_keys": { + "code": "class TimeSuite:\n def time_keys(self):\n for key in self.d.keys():\n pass\n\n def setup(self):\n self.d = {}\n for x in range(500):\n self.d[x] = None", + "min_run_count": 2, + "name": "benchmark_dummy.TimeSuite.time_keys", + "number": 0, + 
"param_names": [], + "params": [], + "repeat": 0, + "rounds": 2, + "sample_time": 0.01, + "type": "time", + "unit": "seconds", + "version": "86e015a3c40c52da31e4185fff7c7176c38c5e1e1e4aba71912db0b388225191", + "warmup_time": -1 + }, + "benchmark_ocl.TimeOmegaConfigLoader.time_loading_catalog": { + "code": "class TimeOmegaConfigLoader:\n def time_loading_catalog(self):\n \"\"\"Benchmark the time to load the catalog\"\"\"\n self.loader[\"catalog\"]\n\n def setup(self):\n # Setup temporary configuration directory with sample config files\n self.temp_dir = tempfile.TemporaryDirectory()\n self.conf_source = Path(self.temp_dir.name)\n \n # Create sample config files in the temp directory\n _create_config_file(self.conf_source, \"base\", \"catalog.yml\", base_catalog)\n _create_config_file(self.conf_source, \"local\", \"catalog.yml\", local_catalog)\n _create_config_file(self.conf_source, \"base\", \"parameters.yml\", base_params)\n _create_config_file(self.conf_source, \"local\", \"parameters.yml\", local_params)\n _create_config_file(self.conf_source, \"base\", \"globals.yml\", base_globals)\n _create_config_file(self.conf_source, \"local\", \"globals.yml\", local_globals)\n \n # Instantiate the OmegaConfigLoader\n self.loader = OmegaConfigLoader(conf_source=self.conf_source, base_env='base', default_run_env='local')", + "min_run_count": 2, + "name": "benchmark_ocl.TimeOmegaConfigLoader.time_loading_catalog", + "number": 0, + "param_names": [], + "params": [], + "repeat": 0, + "rounds": 2, + "sample_time": 0.01, + "type": "time", + "unit": "seconds", + "version": "3ccff2348faeaf3038548994686b45deeaa3c7c46df2270a8a1f697e7401ae5a", + "warmup_time": -1 + }, + "benchmark_ocl.TimeOmegaConfigLoader.time_loading_globals": { + "code": "class TimeOmegaConfigLoader:\n def time_loading_globals(self):\n \"\"\"Benchmark the time to load global configuration\"\"\"\n self.loader[\"globals\"]\n\n def setup(self):\n # Setup temporary configuration directory with sample config files\n self.temp_dir = tempfile.TemporaryDirectory()\n self.conf_source = Path(self.temp_dir.name)\n \n # Create sample config files in the temp directory\n _create_config_file(self.conf_source, \"base\", \"catalog.yml\", base_catalog)\n _create_config_file(self.conf_source, \"local\", \"catalog.yml\", local_catalog)\n _create_config_file(self.conf_source, \"base\", \"parameters.yml\", base_params)\n _create_config_file(self.conf_source, \"local\", \"parameters.yml\", local_params)\n _create_config_file(self.conf_source, \"base\", \"globals.yml\", base_globals)\n _create_config_file(self.conf_source, \"local\", \"globals.yml\", local_globals)\n \n # Instantiate the OmegaConfigLoader\n self.loader = OmegaConfigLoader(conf_source=self.conf_source, base_env='base', default_run_env='local')", + "min_run_count": 2, + "name": "benchmark_ocl.TimeOmegaConfigLoader.time_loading_globals", + "number": 0, + "param_names": [], + "params": [], + "repeat": 0, + "rounds": 2, + "sample_time": 0.01, + "type": "time", + "unit": "seconds", + "version": "d42dda2b001097642dc790de01ab15e3f1f11426f6bfc5affbc1c658248f32be", + "warmup_time": -1 + }, + "benchmark_ocl.TimeOmegaConfigLoader.time_loading_parameters": { + "code": "class TimeOmegaConfigLoader:\n def time_loading_parameters(self):\n \"\"\"Benchmark the time to load the parameters\"\"\"\n self.loader[\"parameters\"]\n\n def setup(self):\n # Setup temporary configuration directory with sample config files\n self.temp_dir = tempfile.TemporaryDirectory()\n self.conf_source = Path(self.temp_dir.name)\n \n # 
Create sample config files in the temp directory\n _create_config_file(self.conf_source, \"base\", \"catalog.yml\", base_catalog)\n _create_config_file(self.conf_source, \"local\", \"catalog.yml\", local_catalog)\n _create_config_file(self.conf_source, \"base\", \"parameters.yml\", base_params)\n _create_config_file(self.conf_source, \"local\", \"parameters.yml\", local_params)\n _create_config_file(self.conf_source, \"base\", \"globals.yml\", base_globals)\n _create_config_file(self.conf_source, \"local\", \"globals.yml\", local_globals)\n \n # Instantiate the OmegaConfigLoader\n self.loader = OmegaConfigLoader(conf_source=self.conf_source, base_env='base', default_run_env='local')", + "min_run_count": 2, + "name": "benchmark_ocl.TimeOmegaConfigLoader.time_loading_parameters", + "number": 0, + "param_names": [], + "params": [], + "repeat": 0, + "rounds": 2, + "sample_time": 0.01, + "type": "time", + "unit": "seconds", + "version": "3187d47ad3445bdf83439512e124e3cde01f0503a3ffa7db9ca7a02e6bc2f7f2", + "warmup_time": -1 + }, + "benchmark_ocl.TimeOmegaConfigLoader.time_loading_parameters_runtime": { + "code": "class TimeOmegaConfigLoader:\n def time_loading_parameters_runtime(self):\n \"\"\"Benchmark the time to load parameters with runtime configuration\"\"\"\n self.loader.runtime_params = _generate_params(2001, 2002)\n self.loader[\"parameters\"]\n\n def setup(self):\n # Setup temporary configuration directory with sample config files\n self.temp_dir = tempfile.TemporaryDirectory()\n self.conf_source = Path(self.temp_dir.name)\n \n # Create sample config files in the temp directory\n _create_config_file(self.conf_source, \"base\", \"catalog.yml\", base_catalog)\n _create_config_file(self.conf_source, \"local\", \"catalog.yml\", local_catalog)\n _create_config_file(self.conf_source, \"base\", \"parameters.yml\", base_params)\n _create_config_file(self.conf_source, \"local\", \"parameters.yml\", local_params)\n _create_config_file(self.conf_source, \"base\", \"globals.yml\", base_globals)\n _create_config_file(self.conf_source, \"local\", \"globals.yml\", local_globals)\n \n # Instantiate the OmegaConfigLoader\n self.loader = OmegaConfigLoader(conf_source=self.conf_source, base_env='base', default_run_env='local')", + "min_run_count": 2, + "name": "benchmark_ocl.TimeOmegaConfigLoader.time_loading_parameters_runtime", + "number": 0, + "param_names": [], + "params": [], + "repeat": 0, + "rounds": 2, + "sample_time": 0.01, + "type": "time", + "unit": "seconds", + "version": "153be6afe75261c83d15bbc165c10b98af15d3489c722c0f7f8e5c0ce3ca2d59", + "warmup_time": -1 + }, + "benchmark_ocl.TimeOmegaConfigLoader.time_merge_soft_strategy": { + "code": "class TimeOmegaConfigLoader:\n def time_merge_soft_strategy(self):\n \"\"\"Benchmark the time to load and soft-merge configurations\"\"\"\n self.loader.merge_strategy = {\"catalog\": \"soft\"}\n self.loader[\"catalog\"]\n\n def setup(self):\n # Setup temporary configuration directory with sample config files\n self.temp_dir = tempfile.TemporaryDirectory()\n self.conf_source = Path(self.temp_dir.name)\n \n # Create sample config files in the temp directory\n _create_config_file(self.conf_source, \"base\", \"catalog.yml\", base_catalog)\n _create_config_file(self.conf_source, \"local\", \"catalog.yml\", local_catalog)\n _create_config_file(self.conf_source, \"base\", \"parameters.yml\", base_params)\n _create_config_file(self.conf_source, \"local\", \"parameters.yml\", local_params)\n _create_config_file(self.conf_source, \"base\", \"globals.yml\", 
base_globals)\n _create_config_file(self.conf_source, \"local\", \"globals.yml\", local_globals)\n \n # Instantiate the OmegaConfigLoader\n self.loader = OmegaConfigLoader(conf_source=self.conf_source, base_env='base', default_run_env='local')", + "min_run_count": 2, + "name": "benchmark_ocl.TimeOmegaConfigLoader.time_merge_soft_strategy", + "number": 0, + "param_names": [], + "params": [], + "repeat": 0, + "rounds": 2, + "sample_time": 0.01, + "type": "time", + "unit": "seconds", + "version": "317897f43311426ea9b688e3019361eb5bb1f61f60eca4f763d7a8ec38265ea2", + "warmup_time": -1 + }, + "benchmark_ocl.TimeOmegaConfigLoaderAdvanced.time_loading_catalog": { + "code": "class TimeOmegaConfigLoaderAdvanced:\n def time_loading_catalog(self):\n \"\"\"Benchmark the time to load the catalog\"\"\"\n self.loader[\"catalog\"]\n\n def setup(self):\n # Setup temporary configuration directory with sample config files\n self.temp_dir = tempfile.TemporaryDirectory()\n self.conf_source = Path(self.temp_dir.name)\n \n # Create sample config files in the temp directory\n _create_config_file(self.conf_source, \"base\", \"catalog.yml\", base_catalog_with_interpolations)\n _create_config_file(self.conf_source, \"local\", \"catalog.yml\", local_catalog_with_interpolations)\n _create_config_file(self.conf_source, \"base\", \"parameters.yml\", base_params_with_globals)\n _create_config_file(self.conf_source, \"base\", \"globals.yml\", base_globals)\n \n # Instantiate the OmegaConfigLoader\n self.loader = OmegaConfigLoader(conf_source=self.conf_source, base_env='base', default_run_env='local')", + "min_run_count": 2, + "name": "benchmark_ocl.TimeOmegaConfigLoaderAdvanced.time_loading_catalog", + "number": 0, + "param_names": [], + "params": [], + "repeat": 0, + "rounds": 2, + "sample_time": 0.01, + "type": "time", + "unit": "seconds", + "version": "5499c39a6750c5d527f1a3e8a747fdd5b3128af31640d9d7ee9c72be261e344a", + "warmup_time": -1 + }, + "benchmark_ocl.TimeOmegaConfigLoaderAdvanced.time_loading_parameters": { + "code": "class TimeOmegaConfigLoaderAdvanced:\n def time_loading_parameters(self):\n \"\"\"Benchmark the time to load parameters with global interpolation\"\"\"\n self.loader[\"parameters\"]\n\n def setup(self):\n # Setup temporary configuration directory with sample config files\n self.temp_dir = tempfile.TemporaryDirectory()\n self.conf_source = Path(self.temp_dir.name)\n \n # Create sample config files in the temp directory\n _create_config_file(self.conf_source, \"base\", \"catalog.yml\", base_catalog_with_interpolations)\n _create_config_file(self.conf_source, \"local\", \"catalog.yml\", local_catalog_with_interpolations)\n _create_config_file(self.conf_source, \"base\", \"parameters.yml\", base_params_with_globals)\n _create_config_file(self.conf_source, \"base\", \"globals.yml\", base_globals)\n \n # Instantiate the OmegaConfigLoader\n self.loader = OmegaConfigLoader(conf_source=self.conf_source, base_env='base', default_run_env='local')", + "min_run_count": 2, + "name": "benchmark_ocl.TimeOmegaConfigLoaderAdvanced.time_loading_parameters", + "number": 0, + "param_names": [], + "params": [], + "repeat": 0, + "rounds": 2, + "sample_time": 0.01, + "type": "time", + "unit": "seconds", + "version": "f74ef4eead7a35df856006dbf9e1b72b61ba36b34767525f55bf8c5eabb343f1", + "warmup_time": -1 + }, + "version": 2 +} \ No newline at end of file diff --git a/benchmarks/benchmark_datacatalog.py b/benchmarks/benchmark_datacatalog.py index d3b12e44ea..7c1a73b6a9 100644 --- a/benchmarks/benchmark_datacatalog.py +++ 
b/benchmarks/benchmark_datacatalog.py @@ -36,6 +36,9 @@ def setup(self): f"param_{i}": i for i in range(1, 1001) } + def time_initialise(self): + """Benchmark the time to initialise the catalog""" + DataCatalog.from_config(base_catalog) def time_save(self): """Benchmark the time to save datasets""" diff --git a/benchmarks/benchmark_ocl.py b/benchmarks/benchmark_ocl.py index 5c38b61901..2dfd970a2e 100644 --- a/benchmarks/benchmark_ocl.py +++ b/benchmarks/benchmark_ocl.py @@ -33,13 +33,13 @@ def _generate_globals(start_range, end_range, is_local=False): return globals_dict def _create_config_file(conf_source, env, file_name, data): - env_path = conf_source / env - env_path.mkdir(parents=True, exist_ok=True) - file_path = env_path / file_name + env_path = conf_source / env + env_path.mkdir(parents=True, exist_ok=True) + file_path = env_path / file_name - import yaml - with open(file_path, "w") as f: - yaml.dump(data, f) + import yaml + with open(file_path, "w") as f: + yaml.dump(data, f) base_catalog = _generate_catalog(1, 1000, is_versioned=True) local_catalog = _generate_catalog(501, 1500, is_local=True) From b8e4203abb98f4cd6bdbdfa3ba139f8cc57bfaef Mon Sep 17 00:00:00 2001 From: Ankita Katiyar Date: Wed, 16 Oct 2024 17:41:15 +0100 Subject: [PATCH 18/19] Revert "Add a test for init and fix indent" This reverts commit 0dbe3c79a1aeba027fb0493ca28b75c49763cfbd. Signed-off-by: Ankita Katiyar --- ...rtualenv-py3.11-kedro-datasets-pandas.json | 1 - .asv/results/M-WFLM6NH6G5/machine.json | 9 - .asv/results/benchmarks.json | 273 ------------------ benchmarks/benchmark_datacatalog.py | 3 - benchmarks/benchmark_ocl.py | 12 +- 5 files changed, 6 insertions(+), 292 deletions(-) delete mode 100644 .asv/results/M-WFLM6NH6G5/f594c8bc-virtualenv-py3.11-kedro-datasets-pandas.json delete mode 100644 .asv/results/M-WFLM6NH6G5/machine.json delete mode 100644 .asv/results/benchmarks.json diff --git a/.asv/results/M-WFLM6NH6G5/f594c8bc-virtualenv-py3.11-kedro-datasets-pandas.json b/.asv/results/M-WFLM6NH6G5/f594c8bc-virtualenv-py3.11-kedro-datasets-pandas.json deleted file mode 100644 index b7dff2c438..0000000000 --- a/.asv/results/M-WFLM6NH6G5/f594c8bc-virtualenv-py3.11-kedro-datasets-pandas.json +++ /dev/null @@ -1 +0,0 @@ -{"commit_hash": "f594c8bcd43100b216ac104d1e670ca4d5783096", "env_name": "virtualenv-py3.11-kedro-datasets-pandas", "date": 1729087847000, "params": {"arch": "arm64", "cpu": "Apple M1 Max", "machine": "M-WFLM6NH6G5", "num_cpu": "10", "os": "Darwin 23.6.0", "ram": "34359738368", "python": "3.11", "kedro-datasets": "", "pandas": ""}, "python": "3.11", "requirements": {"kedro-datasets": "", "pandas": ""}, "env_vars": {}, "result_columns": ["result", "params", "version", "started_at", "duration", "stats_ci_99_a", "stats_ci_99_b", "stats_q_25", "stats_q_75", "stats_number", "stats_repeat", "samples", "profile"], "results": {"benchmark_datacatalog.TimeDataCatalog.time_add_all": [[0.04391904201474972], [], "f470854de9a319e47c6bab999cdb3c7662341b063447a6866500b2e70b2c5ed8", 1729096807842, 1.0691, [-Infinity], [Infinity], [0.043919], [0.043919], [1], [1]], "benchmark_datacatalog.TimeDataCatalog.time_exists": [[0.026259125006617978], [], "9bbce85f01a2cfbb5569bc2ba076dd22b662d5d17db4901cd5269d14dbce3ea6", 1729096808911, 0.93221, [-Infinity], [Infinity], [0.026259], [0.026259], [1], [1]], "benchmark_datacatalog.TimeDataCatalog.time_feed_dict": [[0.034821834007743746], [], "0101ab38b04d7b27eda18738a82f5f06e48604c6e91d0e10bae33327bb588f69", 1729096809843, 0.8628, [-Infinity], [Infinity], [0.034822], 
[0.034822], [1], [1]], "benchmark_datacatalog.TimeDataCatalog.time_initialise": [[0.0948639580165036], [], "9e460ed25ea64f63f905b3f3a01a817e5daa00c81390b0cdfc25fbad0ae85ea6", 1729096810706, 1.0807, [-Infinity], [Infinity], [0.094864], [0.094864], [1], [1]], "benchmark_datacatalog.TimeDataCatalog.time_list": [[1.4999997802078724e-05], [], "4a7ae456f2349941bdbc595b3919284633da1da166cf1394660a0399ec618687", 1729096811787, 0.82309, [-Infinity], [Infinity], [1.5e-05], [1.5e-05], [1], [1]], "benchmark_datacatalog.TimeDataCatalog.time_load": [[0.20476879199850373], [], "96bd6914ea6ed937ae958177afd17861ccf3ab1481a5d4d2ed8002dbc5d1131e", 1729096812610, 0.86483, [-Infinity], [Infinity], [0.20477], [0.20477], [1], [1]], "benchmark_datacatalog.TimeDataCatalog.time_release": [[0.00888583398773335], [], "fa49ed3249b0319f92a7d6309f2a58ed8595c86141b16768cd575326a2d28d77", 1729096813475, 0.76429, [-Infinity], [Infinity], [0.0088858], [0.0088858], [1], [1]], "benchmark_datacatalog.TimeDataCatalog.time_resolve_factory": [[0.09763062497950159], [], "c051d31d513ef455328bb051eafacb1cc06d9f84dd613ee2e0bee3440bbab467", 1729096814240, 0.83693, [-Infinity], [Infinity], [0.097631], [0.097631], [1], [1]], "benchmark_datacatalog.TimeDataCatalog.time_save": [[0.5207992079958785], [], "4ea897eb28bd91fc7cf8da6e2679bf608b909c8db9ebdffd97f3bf19b275a809", 1729096815077, 1.6094, [-Infinity], [Infinity], [0.5208], [0.5208], [1], [1]], "benchmark_datacatalog.TimeDataCatalog.time_shallow_copy": [[0.031129041977692395], [], "64ead39024c492d18b91a21c23a9d3f1323533cd37bd53934d54701ecb259762", 1729096816687, 0.68384, [-Infinity], [Infinity], [0.031129], [0.031129], [1], [1]], "benchmark_dummy.TimeSuite.time_keys": [[3.2910029403865337e-06], [], "86e015a3c40c52da31e4185fff7c7176c38c5e1e1e4aba71912db0b388225191", 1729096817371, 0.18749, [-Infinity], [Infinity], [3.291e-06], [3.291e-06], [1], [1]], "benchmark_ocl.TimeOmegaConfigLoader.time_loading_catalog": [[0.4770808340108488], [], "3ccff2348faeaf3038548994686b45deeaa3c7c46df2270a8a1f697e7401ae5a", 1729096817558, 1.0252, [-Infinity], [Infinity], [0.47708], [0.47708], [1], [1]], "benchmark_ocl.TimeOmegaConfigLoader.time_loading_globals": [[0.11477499999455176], [], "d42dda2b001097642dc790de01ab15e3f1f11426f6bfc5affbc1c658248f32be", 1729096818583, 0.75575, [-Infinity], [Infinity], [0.11477], [0.11477], [1], [1]], "benchmark_ocl.TimeOmegaConfigLoader.time_loading_parameters": [[0.12353595800232142], [], "3187d47ad3445bdf83439512e124e3cde01f0503a3ffa7db9ca7a02e6bc2f7f2", 1729096819339, 0.78861, [-Infinity], [Infinity], [0.12354], [0.12354], [1], [1]], "benchmark_ocl.TimeOmegaConfigLoader.time_loading_parameters_runtime": [[0.13527949998388067], [], "153be6afe75261c83d15bbc165c10b98af15d3489c722c0f7f8e5c0ce3ca2d59", 1729096820128, 0.77955, [-Infinity], [Infinity], [0.13528], [0.13528], [1], [1]], "benchmark_ocl.TimeOmegaConfigLoader.time_merge_soft_strategy": [[0.7280506670067552], [], "317897f43311426ea9b688e3019361eb5bb1f61f60eca4f763d7a8ec38265ea2", 1729096820908, 1.4309, [-Infinity], [Infinity], [0.72805], [0.72805], [1], [1]], "benchmark_ocl.TimeOmegaConfigLoaderAdvanced.time_loading_catalog": [[0.6754177919938229], [], "5499c39a6750c5d527f1a3e8a747fdd5b3128af31640d9d7ee9c72be261e344a", 1729096822339, 1.3692, [-Infinity], [Infinity], [0.67542], [0.67542], [1], [1]], "benchmark_ocl.TimeOmegaConfigLoaderAdvanced.time_loading_parameters": [[1.9011982079828158], [], "f74ef4eead7a35df856006dbf9e1b72b61ba36b34767525f55bf8c5eabb343f1", 1729096823708, 2.5663, [-Infinity], [Infinity], 
[1.9012], [1.9012], [1], [1]]}, "durations": {}, "version": 2} \ No newline at end of file diff --git a/.asv/results/M-WFLM6NH6G5/machine.json b/.asv/results/M-WFLM6NH6G5/machine.json deleted file mode 100644 index 3fe4186a75..0000000000 --- a/.asv/results/M-WFLM6NH6G5/machine.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "arch": "arm64", - "cpu": "Apple M1 Max", - "machine": "M-WFLM6NH6G5", - "num_cpu": "10", - "os": "Darwin 23.6.0", - "ram": "34359738368", - "version": 1 -} \ No newline at end of file diff --git a/.asv/results/benchmarks.json b/.asv/results/benchmarks.json deleted file mode 100644 index 347fbe8942..0000000000 --- a/.asv/results/benchmarks.json +++ /dev/null @@ -1,273 +0,0 @@ -{ - "benchmark_datacatalog.TimeDataCatalog.time_add_all": { - "code": "class TimeDataCatalog:\n def time_add_all(self):\n \"\"\"Benchmark the time to add all datasets\"\"\"\n self.catalog.add_all(self.datasets)\n\n def setup(self):\n self.catalog = DataCatalog.from_config(base_catalog)\n self.dataframe = pd.DataFrame({\"column\": [1, 2, 3]})\n self.dataframe.to_csv(\"data.csv\", index=False)\n self.datasets = {\n f\"dataset_new_{i}\": CSVDataset(filepath=\"data.csv\") for i in range(1, 1001)\n }\n self.feed_dict = {\n f\"param_{i}\": i for i in range(1, 1001)\n }", - "min_run_count": 2, - "name": "benchmark_datacatalog.TimeDataCatalog.time_add_all", - "number": 0, - "param_names": [], - "params": [], - "repeat": 0, - "rounds": 2, - "sample_time": 0.01, - "type": "time", - "unit": "seconds", - "version": "f470854de9a319e47c6bab999cdb3c7662341b063447a6866500b2e70b2c5ed8", - "warmup_time": -1 - }, - "benchmark_datacatalog.TimeDataCatalog.time_exists": { - "code": "class TimeDataCatalog:\n def time_exists(self):\n \"\"\"Benchmark the time to check if datasets exist\"\"\"\n for i in range(1,1001):\n self.catalog.exists(f\"dataset_{i}\")\n\n def setup(self):\n self.catalog = DataCatalog.from_config(base_catalog)\n self.dataframe = pd.DataFrame({\"column\": [1, 2, 3]})\n self.dataframe.to_csv(\"data.csv\", index=False)\n self.datasets = {\n f\"dataset_new_{i}\": CSVDataset(filepath=\"data.csv\") for i in range(1, 1001)\n }\n self.feed_dict = {\n f\"param_{i}\": i for i in range(1, 1001)\n }", - "min_run_count": 2, - "name": "benchmark_datacatalog.TimeDataCatalog.time_exists", - "number": 0, - "param_names": [], - "params": [], - "repeat": 0, - "rounds": 2, - "sample_time": 0.01, - "type": "time", - "unit": "seconds", - "version": "9bbce85f01a2cfbb5569bc2ba076dd22b662d5d17db4901cd5269d14dbce3ea6", - "warmup_time": -1 - }, - "benchmark_datacatalog.TimeDataCatalog.time_feed_dict": { - "code": "class TimeDataCatalog:\n def time_feed_dict(self):\n \"\"\"Benchmark the time to add feed dict\"\"\"\n self.catalog.add_feed_dict(self.feed_dict)\n\n def setup(self):\n self.catalog = DataCatalog.from_config(base_catalog)\n self.dataframe = pd.DataFrame({\"column\": [1, 2, 3]})\n self.dataframe.to_csv(\"data.csv\", index=False)\n self.datasets = {\n f\"dataset_new_{i}\": CSVDataset(filepath=\"data.csv\") for i in range(1, 1001)\n }\n self.feed_dict = {\n f\"param_{i}\": i for i in range(1, 1001)\n }", - "min_run_count": 2, - "name": "benchmark_datacatalog.TimeDataCatalog.time_feed_dict", - "number": 0, - "param_names": [], - "params": [], - "repeat": 0, - "rounds": 2, - "sample_time": 0.01, - "type": "time", - "unit": "seconds", - "version": "0101ab38b04d7b27eda18738a82f5f06e48604c6e91d0e10bae33327bb588f69", - "warmup_time": -1 - }, - "benchmark_datacatalog.TimeDataCatalog.time_initialise": { - "code": "class 
TimeDataCatalog:\n def time_initialise(self):\n \"\"\"Benchmark the time to initialise the catalog\"\"\"\n DataCatalog.from_config(base_catalog)\n\n def setup(self):\n self.catalog = DataCatalog.from_config(base_catalog)\n self.dataframe = pd.DataFrame({\"column\": [1, 2, 3]})\n self.dataframe.to_csv(\"data.csv\", index=False)\n self.datasets = {\n f\"dataset_new_{i}\": CSVDataset(filepath=\"data.csv\") for i in range(1, 1001)\n }\n self.feed_dict = {\n f\"param_{i}\": i for i in range(1, 1001)\n }", - "min_run_count": 2, - "name": "benchmark_datacatalog.TimeDataCatalog.time_initialise", - "number": 0, - "param_names": [], - "params": [], - "repeat": 0, - "rounds": 2, - "sample_time": 0.01, - "type": "time", - "unit": "seconds", - "version": "9e460ed25ea64f63f905b3f3a01a817e5daa00c81390b0cdfc25fbad0ae85ea6", - "warmup_time": -1 - }, - "benchmark_datacatalog.TimeDataCatalog.time_list": { - "code": "class TimeDataCatalog:\n def time_list(self):\n \"\"\"Benchmark the time to list all datasets\"\"\"\n self.catalog.list()\n\n def setup(self):\n self.catalog = DataCatalog.from_config(base_catalog)\n self.dataframe = pd.DataFrame({\"column\": [1, 2, 3]})\n self.dataframe.to_csv(\"data.csv\", index=False)\n self.datasets = {\n f\"dataset_new_{i}\": CSVDataset(filepath=\"data.csv\") for i in range(1, 1001)\n }\n self.feed_dict = {\n f\"param_{i}\": i for i in range(1, 1001)\n }", - "min_run_count": 2, - "name": "benchmark_datacatalog.TimeDataCatalog.time_list", - "number": 0, - "param_names": [], - "params": [], - "repeat": 0, - "rounds": 2, - "sample_time": 0.01, - "type": "time", - "unit": "seconds", - "version": "4a7ae456f2349941bdbc595b3919284633da1da166cf1394660a0399ec618687", - "warmup_time": -1 - }, - "benchmark_datacatalog.TimeDataCatalog.time_load": { - "code": "class TimeDataCatalog:\n def time_load(self):\n \"\"\"Benchmark the time to load datasets\"\"\"\n for i in range(1,1001):\n self.catalog.load(f\"dataset_load_{i}\")\n\n def setup(self):\n self.catalog = DataCatalog.from_config(base_catalog)\n self.dataframe = pd.DataFrame({\"column\": [1, 2, 3]})\n self.dataframe.to_csv(\"data.csv\", index=False)\n self.datasets = {\n f\"dataset_new_{i}\": CSVDataset(filepath=\"data.csv\") for i in range(1, 1001)\n }\n self.feed_dict = {\n f\"param_{i}\": i for i in range(1, 1001)\n }", - "min_run_count": 2, - "name": "benchmark_datacatalog.TimeDataCatalog.time_load", - "number": 0, - "param_names": [], - "params": [], - "repeat": 0, - "rounds": 2, - "sample_time": 0.01, - "type": "time", - "unit": "seconds", - "version": "96bd6914ea6ed937ae958177afd17861ccf3ab1481a5d4d2ed8002dbc5d1131e", - "warmup_time": -1 - }, - "benchmark_datacatalog.TimeDataCatalog.time_release": { - "code": "class TimeDataCatalog:\n def time_release(self):\n \"\"\"Benchmark the time to release datasets\"\"\"\n for i in range(1,1001):\n self.catalog.release(f\"dataset_{i}\")\n\n def setup(self):\n self.catalog = DataCatalog.from_config(base_catalog)\n self.dataframe = pd.DataFrame({\"column\": [1, 2, 3]})\n self.dataframe.to_csv(\"data.csv\", index=False)\n self.datasets = {\n f\"dataset_new_{i}\": CSVDataset(filepath=\"data.csv\") for i in range(1, 1001)\n }\n self.feed_dict = {\n f\"param_{i}\": i for i in range(1, 1001)\n }", - "min_run_count": 2, - "name": "benchmark_datacatalog.TimeDataCatalog.time_release", - "number": 0, - "param_names": [], - "params": [], - "repeat": 0, - "rounds": 2, - "sample_time": 0.01, - "type": "time", - "unit": "seconds", - "version": 
"fa49ed3249b0319f92a7d6309f2a58ed8595c86141b16768cd575326a2d28d77", - "warmup_time": -1 - }, - "benchmark_datacatalog.TimeDataCatalog.time_resolve_factory": { - "code": "class TimeDataCatalog:\n def time_resolve_factory(self):\n \"\"\"Benchmark the time to resolve factory\"\"\"\n for i in range(1,1001):\n self.catalog._get_dataset(f\"dataset_factory_{i}\")\n\n def setup(self):\n self.catalog = DataCatalog.from_config(base_catalog)\n self.dataframe = pd.DataFrame({\"column\": [1, 2, 3]})\n self.dataframe.to_csv(\"data.csv\", index=False)\n self.datasets = {\n f\"dataset_new_{i}\": CSVDataset(filepath=\"data.csv\") for i in range(1, 1001)\n }\n self.feed_dict = {\n f\"param_{i}\": i for i in range(1, 1001)\n }", - "min_run_count": 2, - "name": "benchmark_datacatalog.TimeDataCatalog.time_resolve_factory", - "number": 0, - "param_names": [], - "params": [], - "repeat": 0, - "rounds": 2, - "sample_time": 0.01, - "type": "time", - "unit": "seconds", - "version": "c051d31d513ef455328bb051eafacb1cc06d9f84dd613ee2e0bee3440bbab467", - "warmup_time": -1 - }, - "benchmark_datacatalog.TimeDataCatalog.time_save": { - "code": "class TimeDataCatalog:\n def time_save(self):\n \"\"\"Benchmark the time to save datasets\"\"\"\n for i in range(1,1001):\n self.catalog.save(f\"dataset_{i}\", self.dataframe)\n\n def setup(self):\n self.catalog = DataCatalog.from_config(base_catalog)\n self.dataframe = pd.DataFrame({\"column\": [1, 2, 3]})\n self.dataframe.to_csv(\"data.csv\", index=False)\n self.datasets = {\n f\"dataset_new_{i}\": CSVDataset(filepath=\"data.csv\") for i in range(1, 1001)\n }\n self.feed_dict = {\n f\"param_{i}\": i for i in range(1, 1001)\n }", - "min_run_count": 2, - "name": "benchmark_datacatalog.TimeDataCatalog.time_save", - "number": 0, - "param_names": [], - "params": [], - "repeat": 0, - "rounds": 2, - "sample_time": 0.01, - "type": "time", - "unit": "seconds", - "version": "4ea897eb28bd91fc7cf8da6e2679bf608b909c8db9ebdffd97f3bf19b275a809", - "warmup_time": -1 - }, - "benchmark_datacatalog.TimeDataCatalog.time_shallow_copy": { - "code": "class TimeDataCatalog:\n def time_shallow_copy(self):\n \"\"\"Benchmark the time to shallow copy the catalog\"\"\"\n self.catalog.shallow_copy()\n\n def setup(self):\n self.catalog = DataCatalog.from_config(base_catalog)\n self.dataframe = pd.DataFrame({\"column\": [1, 2, 3]})\n self.dataframe.to_csv(\"data.csv\", index=False)\n self.datasets = {\n f\"dataset_new_{i}\": CSVDataset(filepath=\"data.csv\") for i in range(1, 1001)\n }\n self.feed_dict = {\n f\"param_{i}\": i for i in range(1, 1001)\n }", - "min_run_count": 2, - "name": "benchmark_datacatalog.TimeDataCatalog.time_shallow_copy", - "number": 0, - "param_names": [], - "params": [], - "repeat": 0, - "rounds": 2, - "sample_time": 0.01, - "type": "time", - "unit": "seconds", - "version": "64ead39024c492d18b91a21c23a9d3f1323533cd37bd53934d54701ecb259762", - "warmup_time": -1 - }, - "benchmark_dummy.TimeSuite.time_keys": { - "code": "class TimeSuite:\n def time_keys(self):\n for key in self.d.keys():\n pass\n\n def setup(self):\n self.d = {}\n for x in range(500):\n self.d[x] = None", - "min_run_count": 2, - "name": "benchmark_dummy.TimeSuite.time_keys", - "number": 0, - "param_names": [], - "params": [], - "repeat": 0, - "rounds": 2, - "sample_time": 0.01, - "type": "time", - "unit": "seconds", - "version": "86e015a3c40c52da31e4185fff7c7176c38c5e1e1e4aba71912db0b388225191", - "warmup_time": -1 - }, - "benchmark_ocl.TimeOmegaConfigLoader.time_loading_catalog": { - "code": "class 
TimeOmegaConfigLoader:\n def time_loading_catalog(self):\n \"\"\"Benchmark the time to load the catalog\"\"\"\n self.loader[\"catalog\"]\n\n def setup(self):\n # Setup temporary configuration directory with sample config files\n self.temp_dir = tempfile.TemporaryDirectory()\n self.conf_source = Path(self.temp_dir.name)\n \n # Create sample config files in the temp directory\n _create_config_file(self.conf_source, \"base\", \"catalog.yml\", base_catalog)\n _create_config_file(self.conf_source, \"local\", \"catalog.yml\", local_catalog)\n _create_config_file(self.conf_source, \"base\", \"parameters.yml\", base_params)\n _create_config_file(self.conf_source, \"local\", \"parameters.yml\", local_params)\n _create_config_file(self.conf_source, \"base\", \"globals.yml\", base_globals)\n _create_config_file(self.conf_source, \"local\", \"globals.yml\", local_globals)\n \n # Instantiate the OmegaConfigLoader\n self.loader = OmegaConfigLoader(conf_source=self.conf_source, base_env='base', default_run_env='local')", - "min_run_count": 2, - "name": "benchmark_ocl.TimeOmegaConfigLoader.time_loading_catalog", - "number": 0, - "param_names": [], - "params": [], - "repeat": 0, - "rounds": 2, - "sample_time": 0.01, - "type": "time", - "unit": "seconds", - "version": "3ccff2348faeaf3038548994686b45deeaa3c7c46df2270a8a1f697e7401ae5a", - "warmup_time": -1 - }, - "benchmark_ocl.TimeOmegaConfigLoader.time_loading_globals": { - "code": "class TimeOmegaConfigLoader:\n def time_loading_globals(self):\n \"\"\"Benchmark the time to load global configuration\"\"\"\n self.loader[\"globals\"]\n\n def setup(self):\n # Setup temporary configuration directory with sample config files\n self.temp_dir = tempfile.TemporaryDirectory()\n self.conf_source = Path(self.temp_dir.name)\n \n # Create sample config files in the temp directory\n _create_config_file(self.conf_source, \"base\", \"catalog.yml\", base_catalog)\n _create_config_file(self.conf_source, \"local\", \"catalog.yml\", local_catalog)\n _create_config_file(self.conf_source, \"base\", \"parameters.yml\", base_params)\n _create_config_file(self.conf_source, \"local\", \"parameters.yml\", local_params)\n _create_config_file(self.conf_source, \"base\", \"globals.yml\", base_globals)\n _create_config_file(self.conf_source, \"local\", \"globals.yml\", local_globals)\n \n # Instantiate the OmegaConfigLoader\n self.loader = OmegaConfigLoader(conf_source=self.conf_source, base_env='base', default_run_env='local')", - "min_run_count": 2, - "name": "benchmark_ocl.TimeOmegaConfigLoader.time_loading_globals", - "number": 0, - "param_names": [], - "params": [], - "repeat": 0, - "rounds": 2, - "sample_time": 0.01, - "type": "time", - "unit": "seconds", - "version": "d42dda2b001097642dc790de01ab15e3f1f11426f6bfc5affbc1c658248f32be", - "warmup_time": -1 - }, - "benchmark_ocl.TimeOmegaConfigLoader.time_loading_parameters": { - "code": "class TimeOmegaConfigLoader:\n def time_loading_parameters(self):\n \"\"\"Benchmark the time to load the parameters\"\"\"\n self.loader[\"parameters\"]\n\n def setup(self):\n # Setup temporary configuration directory with sample config files\n self.temp_dir = tempfile.TemporaryDirectory()\n self.conf_source = Path(self.temp_dir.name)\n \n # Create sample config files in the temp directory\n _create_config_file(self.conf_source, \"base\", \"catalog.yml\", base_catalog)\n _create_config_file(self.conf_source, \"local\", \"catalog.yml\", local_catalog)\n _create_config_file(self.conf_source, \"base\", \"parameters.yml\", base_params)\n 
_create_config_file(self.conf_source, \"local\", \"parameters.yml\", local_params)\n _create_config_file(self.conf_source, \"base\", \"globals.yml\", base_globals)\n _create_config_file(self.conf_source, \"local\", \"globals.yml\", local_globals)\n \n # Instantiate the OmegaConfigLoader\n self.loader = OmegaConfigLoader(conf_source=self.conf_source, base_env='base', default_run_env='local')", - "min_run_count": 2, - "name": "benchmark_ocl.TimeOmegaConfigLoader.time_loading_parameters", - "number": 0, - "param_names": [], - "params": [], - "repeat": 0, - "rounds": 2, - "sample_time": 0.01, - "type": "time", - "unit": "seconds", - "version": "3187d47ad3445bdf83439512e124e3cde01f0503a3ffa7db9ca7a02e6bc2f7f2", - "warmup_time": -1 - }, - "benchmark_ocl.TimeOmegaConfigLoader.time_loading_parameters_runtime": { - "code": "class TimeOmegaConfigLoader:\n def time_loading_parameters_runtime(self):\n \"\"\"Benchmark the time to load parameters with runtime configuration\"\"\"\n self.loader.runtime_params = _generate_params(2001, 2002)\n self.loader[\"parameters\"]\n\n def setup(self):\n # Setup temporary configuration directory with sample config files\n self.temp_dir = tempfile.TemporaryDirectory()\n self.conf_source = Path(self.temp_dir.name)\n \n # Create sample config files in the temp directory\n _create_config_file(self.conf_source, \"base\", \"catalog.yml\", base_catalog)\n _create_config_file(self.conf_source, \"local\", \"catalog.yml\", local_catalog)\n _create_config_file(self.conf_source, \"base\", \"parameters.yml\", base_params)\n _create_config_file(self.conf_source, \"local\", \"parameters.yml\", local_params)\n _create_config_file(self.conf_source, \"base\", \"globals.yml\", base_globals)\n _create_config_file(self.conf_source, \"local\", \"globals.yml\", local_globals)\n \n # Instantiate the OmegaConfigLoader\n self.loader = OmegaConfigLoader(conf_source=self.conf_source, base_env='base', default_run_env='local')", - "min_run_count": 2, - "name": "benchmark_ocl.TimeOmegaConfigLoader.time_loading_parameters_runtime", - "number": 0, - "param_names": [], - "params": [], - "repeat": 0, - "rounds": 2, - "sample_time": 0.01, - "type": "time", - "unit": "seconds", - "version": "153be6afe75261c83d15bbc165c10b98af15d3489c722c0f7f8e5c0ce3ca2d59", - "warmup_time": -1 - }, - "benchmark_ocl.TimeOmegaConfigLoader.time_merge_soft_strategy": { - "code": "class TimeOmegaConfigLoader:\n def time_merge_soft_strategy(self):\n \"\"\"Benchmark the time to load and soft-merge configurations\"\"\"\n self.loader.merge_strategy = {\"catalog\": \"soft\"}\n self.loader[\"catalog\"]\n\n def setup(self):\n # Setup temporary configuration directory with sample config files\n self.temp_dir = tempfile.TemporaryDirectory()\n self.conf_source = Path(self.temp_dir.name)\n \n # Create sample config files in the temp directory\n _create_config_file(self.conf_source, \"base\", \"catalog.yml\", base_catalog)\n _create_config_file(self.conf_source, \"local\", \"catalog.yml\", local_catalog)\n _create_config_file(self.conf_source, \"base\", \"parameters.yml\", base_params)\n _create_config_file(self.conf_source, \"local\", \"parameters.yml\", local_params)\n _create_config_file(self.conf_source, \"base\", \"globals.yml\", base_globals)\n _create_config_file(self.conf_source, \"local\", \"globals.yml\", local_globals)\n \n # Instantiate the OmegaConfigLoader\n self.loader = OmegaConfigLoader(conf_source=self.conf_source, base_env='base', default_run_env='local')", - "min_run_count": 2, - "name": 
"benchmark_ocl.TimeOmegaConfigLoader.time_merge_soft_strategy", - "number": 0, - "param_names": [], - "params": [], - "repeat": 0, - "rounds": 2, - "sample_time": 0.01, - "type": "time", - "unit": "seconds", - "version": "317897f43311426ea9b688e3019361eb5bb1f61f60eca4f763d7a8ec38265ea2", - "warmup_time": -1 - }, - "benchmark_ocl.TimeOmegaConfigLoaderAdvanced.time_loading_catalog": { - "code": "class TimeOmegaConfigLoaderAdvanced:\n def time_loading_catalog(self):\n \"\"\"Benchmark the time to load the catalog\"\"\"\n self.loader[\"catalog\"]\n\n def setup(self):\n # Setup temporary configuration directory with sample config files\n self.temp_dir = tempfile.TemporaryDirectory()\n self.conf_source = Path(self.temp_dir.name)\n \n # Create sample config files in the temp directory\n _create_config_file(self.conf_source, \"base\", \"catalog.yml\", base_catalog_with_interpolations)\n _create_config_file(self.conf_source, \"local\", \"catalog.yml\", local_catalog_with_interpolations)\n _create_config_file(self.conf_source, \"base\", \"parameters.yml\", base_params_with_globals)\n _create_config_file(self.conf_source, \"base\", \"globals.yml\", base_globals)\n \n # Instantiate the OmegaConfigLoader\n self.loader = OmegaConfigLoader(conf_source=self.conf_source, base_env='base', default_run_env='local')", - "min_run_count": 2, - "name": "benchmark_ocl.TimeOmegaConfigLoaderAdvanced.time_loading_catalog", - "number": 0, - "param_names": [], - "params": [], - "repeat": 0, - "rounds": 2, - "sample_time": 0.01, - "type": "time", - "unit": "seconds", - "version": "5499c39a6750c5d527f1a3e8a747fdd5b3128af31640d9d7ee9c72be261e344a", - "warmup_time": -1 - }, - "benchmark_ocl.TimeOmegaConfigLoaderAdvanced.time_loading_parameters": { - "code": "class TimeOmegaConfigLoaderAdvanced:\n def time_loading_parameters(self):\n \"\"\"Benchmark the time to load parameters with global interpolation\"\"\"\n self.loader[\"parameters\"]\n\n def setup(self):\n # Setup temporary configuration directory with sample config files\n self.temp_dir = tempfile.TemporaryDirectory()\n self.conf_source = Path(self.temp_dir.name)\n \n # Create sample config files in the temp directory\n _create_config_file(self.conf_source, \"base\", \"catalog.yml\", base_catalog_with_interpolations)\n _create_config_file(self.conf_source, \"local\", \"catalog.yml\", local_catalog_with_interpolations)\n _create_config_file(self.conf_source, \"base\", \"parameters.yml\", base_params_with_globals)\n _create_config_file(self.conf_source, \"base\", \"globals.yml\", base_globals)\n \n # Instantiate the OmegaConfigLoader\n self.loader = OmegaConfigLoader(conf_source=self.conf_source, base_env='base', default_run_env='local')", - "min_run_count": 2, - "name": "benchmark_ocl.TimeOmegaConfigLoaderAdvanced.time_loading_parameters", - "number": 0, - "param_names": [], - "params": [], - "repeat": 0, - "rounds": 2, - "sample_time": 0.01, - "type": "time", - "unit": "seconds", - "version": "f74ef4eead7a35df856006dbf9e1b72b61ba36b34767525f55bf8c5eabb343f1", - "warmup_time": -1 - }, - "version": 2 -} \ No newline at end of file diff --git a/benchmarks/benchmark_datacatalog.py b/benchmarks/benchmark_datacatalog.py index 7c1a73b6a9..d3b12e44ea 100644 --- a/benchmarks/benchmark_datacatalog.py +++ b/benchmarks/benchmark_datacatalog.py @@ -36,9 +36,6 @@ def setup(self): f"param_{i}": i for i in range(1, 1001) } - def time_initialise(self): - """Benchmark the time to initialise the catalog""" - DataCatalog.from_config(base_catalog) def time_save(self): """Benchmark the time 
to save datasets""" diff --git a/benchmarks/benchmark_ocl.py b/benchmarks/benchmark_ocl.py index 2dfd970a2e..5c38b61901 100644 --- a/benchmarks/benchmark_ocl.py +++ b/benchmarks/benchmark_ocl.py @@ -33,13 +33,13 @@ def _generate_globals(start_range, end_range, is_local=False): return globals_dict def _create_config_file(conf_source, env, file_name, data): - env_path = conf_source / env - env_path.mkdir(parents=True, exist_ok=True) - file_path = env_path / file_name + env_path = conf_source / env + env_path.mkdir(parents=True, exist_ok=True) + file_path = env_path / file_name - import yaml - with open(file_path, "w") as f: - yaml.dump(data, f) + import yaml + with open(file_path, "w") as f: + yaml.dump(data, f) base_catalog = _generate_catalog(1, 1000, is_versioned=True) local_catalog = _generate_catalog(501, 1500, is_local=True) From 821401deea8664b0ec91ae0e378268962b844b78 Mon Sep 17 00:00:00 2001 From: Ankita Katiyar Date: Wed, 16 Oct 2024 17:44:16 +0100 Subject: [PATCH 19/19] Add a test for init and fix indent Signed-off-by: Ankita Katiyar --- benchmarks/benchmark_datacatalog.py | 3 +++ benchmarks/benchmark_ocl.py | 12 ++++++------ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/benchmarks/benchmark_datacatalog.py b/benchmarks/benchmark_datacatalog.py index d3b12e44ea..15de4ef310 100644 --- a/benchmarks/benchmark_datacatalog.py +++ b/benchmarks/benchmark_datacatalog.py @@ -36,6 +36,9 @@ def setup(self): f"param_{i}": i for i in range(1, 1001) } + def time_init(self): + """Benchmark the time to initialize the catalog""" + DataCatalog.from_config(base_catalog) def time_save(self): """Benchmark the time to save datasets""" diff --git a/benchmarks/benchmark_ocl.py b/benchmarks/benchmark_ocl.py index 5c38b61901..2dfd970a2e 100644 --- a/benchmarks/benchmark_ocl.py +++ b/benchmarks/benchmark_ocl.py @@ -33,13 +33,13 @@ def _generate_globals(start_range, end_range, is_local=False): return globals_dict def _create_config_file(conf_source, env, file_name, data): - env_path = conf_source / env - env_path.mkdir(parents=True, exist_ok=True) - file_path = env_path / file_name + env_path = conf_source / env + env_path.mkdir(parents=True, exist_ok=True) + file_path = env_path / file_name - import yaml - with open(file_path, "w") as f: - yaml.dump(data, f) + import yaml + with open(file_path, "w") as f: + yaml.dump(data, f) base_catalog = _generate_catalog(1, 1000, is_versioned=True) local_catalog = _generate_catalog(501, 1500, is_local=True)
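A closing note on the benchmark layout that the last three patches adjust: asv (airspeed velocity) collects classes from the `benchmarks/` directory, calls `setup()` before each measurement, and times any method whose name begins with `time_`. The sketch below is a minimal self-contained suite following that convention; it is illustrative and not one of the project's actual benchmark classes:

```python
import tempfile
from pathlib import Path


class TimeExampleSuite:
    """Minimal asv suite: asv times every method whose name starts with ``time_``."""

    def setup(self):
        # Runs before each timed method: create files to benchmark against.
        self.temp_dir = tempfile.TemporaryDirectory()
        self.paths = [Path(self.temp_dir.name) / f"file_{i}.txt" for i in range(100)]
        for path in self.paths:
            path.write_text("payload")

    def teardown(self):
        # Runs after each timed method: remove the temporary files.
        self.temp_dir.cleanup()

    def time_read_all(self):
        """Benchmark the time to read all files"""
        for path in self.paths:
            path.read_text()
```

Suites are executed locally with `asv run`. As the add-then-revert sequence in patches 17 and 18 shows, the machine-specific result files asv writes under `.asv/results` are generated artefacts; only the changes under `benchmarks/` were meant to be committed, which is what patch 19 reapplies.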