From 2e950a2f2cc07f45b028c3668a692e86d5227775 Mon Sep 17 00:00:00 2001 From: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com> Date: Fri, 18 Oct 2024 16:23:07 +0100 Subject: [PATCH] Performance tests for `DataCatalog` (#4230) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update index.md (#4221) Fixed an erroneous link to the Get started with Kedro - Create your first data pipeline with Kedro video. It was accidentally linked to the previous video. Signed-off-by: Greg Vaslowski <7269272+Vaslo@users.noreply.github.com> Signed-off-by: Ankita Katiyar * Bump kedro-sphinx-theme from 2024.4.0 to 2024.10.0 (#4216) * Bump kedro-sphinx-theme from 2024.4.0 to 2024.10.0 Bumps [kedro-sphinx-theme](https://github.com/kedro-org/kedro-sphinx-theme) from 2024.4.0 to 2024.10.0. - [Release notes](https://github.com/kedro-org/kedro-sphinx-theme/releases) - [Commits](https://github.com/kedro-org/kedro-sphinx-theme/compare/v2024.4.0...v2024.10.0) --- updated-dependencies: - dependency-name: kedro-sphinx-theme dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] * updated to 2024.10.2 * trigger_run * trigger_run --------- Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: L. R. Couto <57910428+lrcouto@users.noreply.github.com> Co-authored-by: rashidakanchwala Co-authored-by: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com> Signed-off-by: Ankita Katiyar * Replace all instances of "data set" with "dataset" (#4211) Signed-off-by: Deepyaman Datta Signed-off-by: Ankita Katiyar * Manually created sitemap.xml for improved control over indexed docs pages (#4145) * Load manually created sitemap Signed-off-by: Dmitry Sorokin <129520297+DmitrySorokinQB@users.noreply.github.com> * Add projects remove lastmod for latest Signed-off-by: Dmitry Sorokin <129520297+DmitrySorokinQB@users.noreply.github.com> * Add latest for projects Signed-off-by: Dmitry Sorokin <129520297+DmitrySorokinQB@users.noreply.github.com> --------- Signed-off-by: Dmitry Sorokin <129520297+DmitrySorokinQB@users.noreply.github.com> Co-authored-by: Dmitry Sorokin <129520297+DmitrySorokinQB@users.noreply.github.com> Co-authored-by: ElenaKhaustova <157851531+ElenaKhaustova@users.noreply.github.com> Co-authored-by: L. R. Couto <57910428+lrcouto@users.noreply.github.com> Signed-off-by: Ankita Katiyar * Bump up version to 0.19.9 (#4219) * Bump up version to 0.19.9 Signed-off-by: Laura Couto * Add placeholders to release.md Signed-off-by: Laura Couto * Update citation.cff release date Signed-off-by: Laura Couto --------- Signed-off-by: Laura Couto Signed-off-by: L. R. Couto <57910428+lrcouto@users.noreply.github.com> Signed-off-by: Ankita Katiyar * first pass doesn't work yet Signed-off-by: Ankita Katiyar * Update ocl tests Signed-off-by: Ankita Katiyar * revert some changes Signed-off-by: Ankita Katiyar * Update to use larger config Signed-off-by: Ankita Katiyar * Update functions and docstrings Signed-off-by: Ankita Katiyar * Add performance tests for DataCatalog Signed-off-by: Ankita Katiyar * Update mypy ignore messages (#4228) Signed-off-by: Ankita Katiyar * Revise Kedro project structure docs (#4208) * Update project structure docs --------- Signed-off-by: Dmitry Sorokin Signed-off-by: Dmitry Sorokin <40151847+DimedS@users.noreply.github.com> Co-authored-by: Juan Luis Cano Rodríguez Signed-off-by: Ankita Katiyar * Update CLI autocompletion docs with new Click syntax (#4213) * Update CLI autocompletion docs with new Click syntax Updated the autocompletion setup instructions for Bash, Zsh, and Fish shells to reflect the latest Click 8.1 syntax. Changed Fish shell completion script path to ~/.config/fish/completions/kedro.fish for correct placement. Signed-off-by: hyew0nChoi Signed-off-by: Ankita Katiyar * Bump import-linter from 2.0 to 2.1 (#4226) Bumps [import-linter](https://github.com/seddonym/import-linter) from 2.0 to 2.1. - [Changelog](https://github.com/seddonym/import-linter/blob/master/CHANGELOG.rst) - [Commits](https://github.com/seddonym/import-linter/compare/v2.0...v2.1) --- updated-dependencies: - dependency-name: import-linter dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: Ankita Katiyar * Performance test for `OmegaConfigLoader` (#4225) * first pass doesn't work yet Signed-off-by: Ankita Katiyar * Update ocl tests Signed-off-by: Ankita Katiyar * revert some changes Signed-off-by: Ankita Katiyar * Update to use larger config Signed-off-by: Ankita Katiyar * Update functions and docstrings Signed-off-by: Ankita Katiyar * lint Signed-off-by: Ankita Katiyar --------- Signed-off-by: Ankita Katiyar * Add a test for init and fix indent Signed-off-by: Ankita Katiyar * Revert "Add a test for init and fix indent" This reverts commit 0dbe3c79a1aeba027fb0493ca28b75c49763cfbd. Signed-off-by: Ankita Katiyar * Add a test for init and fix indent Signed-off-by: Ankita Katiyar --------- Signed-off-by: Greg Vaslowski <7269272+Vaslo@users.noreply.github.com> Signed-off-by: Ankita Katiyar Signed-off-by: dependabot[bot] Signed-off-by: Deepyaman Datta Signed-off-by: Dmitry Sorokin <129520297+DmitrySorokinQB@users.noreply.github.com> Signed-off-by: Laura Couto Signed-off-by: L. R. Couto <57910428+lrcouto@users.noreply.github.com> Signed-off-by: Dmitry Sorokin Signed-off-by: Dmitry Sorokin <40151847+DimedS@users.noreply.github.com> Signed-off-by: hyew0nChoi Signed-off-by: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com> --- asv.conf.json | 8 ++- benchmarks/benchmark_datacatalog.py | 82 +++++++++++++++++++++++++++++ benchmarks/benchmark_ocl.py | 12 ++--- 3 files changed, 95 insertions(+), 7 deletions(-) create mode 100644 benchmarks/benchmark_datacatalog.py diff --git a/asv.conf.json b/asv.conf.json index 2cfcd3a057..b61a6c58a3 100644 --- a/asv.conf.json +++ b/asv.conf.json @@ -8,5 +8,11 @@ "environment_type": "virtualenv", "show_commit_url": "http://github.com/kedro-org/kedro/commit/", "results_dir": ".asv/results", - "html_dir": ".asv/html" + "html_dir": ".asv/html", + "matrix": { + "req": { + "kedro-datasets": [], + "pandas": [] + } + } } diff --git a/benchmarks/benchmark_datacatalog.py b/benchmarks/benchmark_datacatalog.py new file mode 100644 index 0000000000..15de4ef310 --- /dev/null +++ b/benchmarks/benchmark_datacatalog.py @@ -0,0 +1,82 @@ +import pandas as pd +from kedro_datasets.pandas import CSVDataset + +from kedro.io import DataCatalog + +base_catalog = { + f"dataset_{i}": { + "type": "pandas.CSVDataset", + "filepath": f"data_{i}.csv", + } for i in range(1, 1001) +} +# Add datasets with the same filepath for loading +base_catalog.update({ + f"dataset_load_{i}": { + "type": "pandas.CSVDataset", + "filepath": "data.csv", + } for i in range(1, 1001) +}) +# Add a factory pattern +base_catalog.update({ + "dataset_factory_{placeholder}": { + "type": "pandas.CSVDataset", + "filepath": "data_{placeholder}.csv", + } +}) + +class TimeDataCatalog: + def setup(self): + self.catalog = DataCatalog.from_config(base_catalog) + self.dataframe = pd.DataFrame({"column": [1, 2, 3]}) + self.dataframe.to_csv("data.csv", index=False) + self.datasets = { + f"dataset_new_{i}": CSVDataset(filepath="data.csv") for i in range(1, 1001) + } + self.feed_dict = { + f"param_{i}": i for i in range(1, 1001) + } + + def time_init(self): + """Benchmark the time to initialize the catalog""" + DataCatalog.from_config(base_catalog) + + def time_save(self): + """Benchmark the time to save datasets""" + for i in range(1,1001): + self.catalog.save(f"dataset_{i}", self.dataframe) + + def time_load(self): + """Benchmark the time to load datasets""" + for i in range(1,1001): + self.catalog.load(f"dataset_load_{i}") + + def time_exists(self): + """Benchmark the time to check if datasets exist""" + for i in range(1,1001): + self.catalog.exists(f"dataset_{i}") + + def time_release(self): + """Benchmark the time to release datasets""" + for i in range(1,1001): + self.catalog.release(f"dataset_{i}") + + def time_add_all(self): + """Benchmark the time to add all datasets""" + self.catalog.add_all(self.datasets) + + def time_feed_dict(self): + """Benchmark the time to add feed dict""" + self.catalog.add_feed_dict(self.feed_dict) + + def time_list(self): + """Benchmark the time to list all datasets""" + self.catalog.list() + + def time_shallow_copy(self): + """Benchmark the time to shallow copy the catalog""" + self.catalog.shallow_copy() + + def time_resolve_factory(self): + """Benchmark the time to resolve factory""" + for i in range(1,1001): + self.catalog._get_dataset(f"dataset_factory_{i}") diff --git a/benchmarks/benchmark_ocl.py b/benchmarks/benchmark_ocl.py index 5c38b61901..2dfd970a2e 100644 --- a/benchmarks/benchmark_ocl.py +++ b/benchmarks/benchmark_ocl.py @@ -33,13 +33,13 @@ def _generate_globals(start_range, end_range, is_local=False): return globals_dict def _create_config_file(conf_source, env, file_name, data): - env_path = conf_source / env - env_path.mkdir(parents=True, exist_ok=True) - file_path = env_path / file_name + env_path = conf_source / env + env_path.mkdir(parents=True, exist_ok=True) + file_path = env_path / file_name - import yaml - with open(file_path, "w") as f: - yaml.dump(data, f) + import yaml + with open(file_path, "w") as f: + yaml.dump(data, f) base_catalog = _generate_catalog(1, 1000, is_versioned=True) local_catalog = _generate_catalog(501, 1500, is_local=True)