diff --git a/asv.conf.json b/asv.conf.json
index 2cfcd3a057..b61a6c58a3 100644
--- a/asv.conf.json
+++ b/asv.conf.json
@@ -8,5 +8,11 @@
     "environment_type": "virtualenv",
     "show_commit_url": "http://github.com/kedro-org/kedro/commit/",
     "results_dir": ".asv/results",
-    "html_dir": ".asv/html"
+    "html_dir": ".asv/html",
+    "matrix": {
+        "req": {
+            "kedro-datasets": [],
+            "pandas": []
+        }
+    }
 }
diff --git a/benchmarks/benchmark_datacatalog.py b/benchmarks/benchmark_datacatalog.py
new file mode 100644
index 0000000000..15de4ef310
--- /dev/null
+++ b/benchmarks/benchmark_datacatalog.py
@@ -0,0 +1,82 @@
+import pandas as pd
+from kedro_datasets.pandas import CSVDataset
+
+from kedro.io import DataCatalog
+
+base_catalog = {
+    f"dataset_{i}": {
+        "type": "pandas.CSVDataset",
+        "filepath": f"data_{i}.csv",
+    } for i in range(1, 1001)
+}
+# Add datasets with the same filepath for loading
+base_catalog.update({
+    f"dataset_load_{i}": {
+        "type": "pandas.CSVDataset",
+        "filepath": "data.csv",
+    } for i in range(1, 1001)
+})
+# Add a factory pattern
+base_catalog.update({
+    "dataset_factory_{placeholder}": {
+        "type": "pandas.CSVDataset",
+        "filepath": "data_{placeholder}.csv",
+    }
+})
+
+class TimeDataCatalog:
+    def setup(self):
+        self.catalog = DataCatalog.from_config(base_catalog)
+        self.dataframe = pd.DataFrame({"column": [1, 2, 3]})
+        self.dataframe.to_csv("data.csv", index=False)
+        self.datasets = {
+            f"dataset_new_{i}": CSVDataset(filepath="data.csv") for i in range(1, 1001)
+        }
+        self.feed_dict = {
+            f"param_{i}": i for i in range(1, 1001)
+        }
+
+    def time_init(self):
+        """Benchmark the time to initialize the catalog"""
+        DataCatalog.from_config(base_catalog)
+
+    def time_save(self):
+        """Benchmark the time to save datasets"""
+        for i in range(1,1001):
+            self.catalog.save(f"dataset_{i}", self.dataframe)
+
+    def time_load(self):
+        """Benchmark the time to load datasets"""
+        for i in range(1,1001):
+            self.catalog.load(f"dataset_load_{i}")
+
+    def time_exists(self):
+        """Benchmark the time to check if datasets exist"""
+        for i in range(1,1001):
+            self.catalog.exists(f"dataset_{i}")
+
+    def time_release(self):
+        """Benchmark the time to release datasets"""
+        for i in range(1,1001):
+            self.catalog.release(f"dataset_{i}")
+
+    def time_add_all(self):
+        """Benchmark the time to add all datasets"""
+        self.catalog.add_all(self.datasets)
+
+    def time_feed_dict(self):
+        """Benchmark the time to add feed dict"""
+        self.catalog.add_feed_dict(self.feed_dict)
+
+    def time_list(self):
+        """Benchmark the time to list all datasets"""
+        self.catalog.list()
+
+    def time_shallow_copy(self):
+        """Benchmark the time to shallow copy the catalog"""
+        self.catalog.shallow_copy()
+
+    def time_resolve_factory(self):
+        """Benchmark the time to resolve factory"""
+        for i in range(1,1001):
+            self.catalog._get_dataset(f"dataset_factory_{i}")
diff --git a/benchmarks/benchmark_ocl.py b/benchmarks/benchmark_ocl.py
index 5c38b61901..2dfd970a2e 100644
--- a/benchmarks/benchmark_ocl.py
+++ b/benchmarks/benchmark_ocl.py
@@ -33,13 +33,13 @@ def _generate_globals(start_range, end_range, is_local=False):
     return globals_dict
 
 def _create_config_file(conf_source, env, file_name, data):
-    env_path = conf_source / env
-    env_path.mkdir(parents=True, exist_ok=True)
-    file_path = env_path / file_name
+    env_path = conf_source / env
+    env_path.mkdir(parents=True, exist_ok=True)
+    file_path = env_path / file_name
 
-    import yaml
-    with open(file_path, "w") as f:
-        yaml.dump(data, f)
+    import yaml
+    with open(file_path, "w") as f:
+        yaml.dump(data, f)
 
 base_catalog = _generate_catalog(1, 1000, is_versioned=True)
 local_catalog = _generate_catalog(501, 1500, is_local=True)
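
For context: the matrix.req block added to asv.conf.json declares kedro-datasets and pandas as
requirements of the benchmark environments (the empty lists pin no version, so asv installs
whatever pip resolves), and running "asv run" discovers and times every time_* method of
TimeDataCatalog. The sketch below shows the DataCatalog calls those methods exercise, using a
hand-written two-entry config shaped like base_catalog; the dataset names and file paths are
illustrative only, and it assumes kedro and kedro-datasets are installed:

    import pandas as pd

    from kedro.io import DataCatalog

    # Illustrative config: one concrete dataset plus one dataset factory pattern,
    # mirroring the entries that base_catalog generates above.
    config = {
        "dataset_1": {"type": "pandas.CSVDataset", "filepath": "data_1.csv"},
        "dataset_factory_{placeholder}": {
            "type": "pandas.CSVDataset",
            "filepath": "data_{placeholder}.csv",
        },
    }

    catalog = DataCatalog.from_config(config)                       # time_init
    catalog.save("dataset_1", pd.DataFrame({"column": [1, 2, 3]}))  # time_save
    df = catalog.load("dataset_1")                                  # time_load
    assert catalog.exists("dataset_1")                              # time_exists
    catalog.release("dataset_1")                                    # time_release
    # "dataset_factory_7" matches the {placeholder} pattern, so the catalog
    # materialises a CSVDataset for data_7.csv on first access (time_resolve_factory).
    dataset = catalog._get_dataset("dataset_factory_7")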