Merge branch 'main' into feature/4175-catalog-dict-interface
ElenaKhaustova committed Oct 18, 2024
2 parents 6650a83 + 2e950a2 commit 8df7d91
Showing 3 changed files with 95 additions and 7 deletions.
8 changes: 7 additions & 1 deletion asv.conf.json
@@ -8,5 +8,11 @@
"environment_type": "virtualenv",
"show_commit_url": "http://github.com/kedro-org/kedro/commit/",
"results_dir": ".asv/results",
"html_dir": ".asv/html"
"html_dir": ".asv/html",
"matrix": {
"req": {
"kedro-datasets": [],
"pandas": []
}
}
}
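The new "matrix"/"req" block declares kedro-datasets and pandas as pip requirements that asv installs into each benchmark environment; the empty version lists appear to leave the versions unpinned. A purely illustrative sanity check (not part of this commit) that those packages resolve locally before invoking asv:

# Illustrative only: confirm the packages listed under "matrix"/"req"
# can be imported in the current environment before running the benchmarks.
import importlib

for pkg in ("kedro_datasets", "pandas"):
    module = importlib.import_module(pkg)
    print(pkg, getattr(module, "__version__", "unknown version"))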
82 changes: 82 additions & 0 deletions benchmarks/benchmark_datacatalog.py
@@ -0,0 +1,82 @@
import pandas as pd
from kedro_datasets.pandas import CSVDataset

from kedro.io import DataCatalog

base_catalog = {
    f"dataset_{i}": {
        "type": "pandas.CSVDataset",
        "filepath": f"data_{i}.csv",
    } for i in range(1, 1001)
}
# Add datasets with the same filepath for loading
base_catalog.update({
    f"dataset_load_{i}": {
        "type": "pandas.CSVDataset",
        "filepath": "data.csv",
    } for i in range(1, 1001)
})
# Add a factory pattern
base_catalog.update({
    "dataset_factory_{placeholder}": {
        "type": "pandas.CSVDataset",
        "filepath": "data_{placeholder}.csv",
    }
})


class TimeDataCatalog:
    def setup(self):
        self.catalog = DataCatalog.from_config(base_catalog)
        self.dataframe = pd.DataFrame({"column": [1, 2, 3]})
        self.dataframe.to_csv("data.csv", index=False)
        self.datasets = {
            f"dataset_new_{i}": CSVDataset(filepath="data.csv") for i in range(1, 1001)
        }
        self.feed_dict = {
            f"param_{i}": i for i in range(1, 1001)
        }

    def time_init(self):
        """Benchmark the time to initialize the catalog"""
        DataCatalog.from_config(base_catalog)

    def time_save(self):
        """Benchmark the time to save datasets"""
        for i in range(1, 1001):
            self.catalog.save(f"dataset_{i}", self.dataframe)

    def time_load(self):
        """Benchmark the time to load datasets"""
        for i in range(1, 1001):
            self.catalog.load(f"dataset_load_{i}")

    def time_exists(self):
        """Benchmark the time to check if datasets exist"""
        for i in range(1, 1001):
            self.catalog.exists(f"dataset_{i}")

    def time_release(self):
        """Benchmark the time to release datasets"""
        for i in range(1, 1001):
            self.catalog.release(f"dataset_{i}")

    def time_add_all(self):
        """Benchmark the time to add all datasets"""
        self.catalog.add_all(self.datasets)

    def time_feed_dict(self):
        """Benchmark the time to add feed dict"""
        self.catalog.add_feed_dict(self.feed_dict)

    def time_list(self):
        """Benchmark the time to list all datasets"""
        self.catalog.list()

    def time_shallow_copy(self):
        """Benchmark the time to shallow copy the catalog"""
        self.catalog.shallow_copy()

    def time_resolve_factory(self):
        """Benchmark the time to resolve factory"""
        for i in range(1, 1001):
            self.catalog._get_dataset(f"dataset_factory_{i}")
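
For context, a minimal sketch (not part of the commit) of how these benchmark methods could be exercised outside asv, assuming the file is importable as benchmarks.benchmark_datacatalog and that kedro, kedro-datasets and pandas are installed:

# Rough manual timing harness; asv normally drives setup() and the
# time_* methods itself, so this is only a local smoke test.
import timeit

from benchmarks.benchmark_datacatalog import TimeDataCatalog

bench = TimeDataCatalog()
bench.setup()

for name in ("time_init", "time_save", "time_load", "time_list"):
    seconds = timeit.timeit(getattr(bench, name), number=1)
    print(f"{name}: {seconds:.3f}s")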
12 changes: 6 additions & 6 deletions benchmarks/benchmark_ocl.py
@@ -33,13 +33,13 @@ def _generate_globals(start_range, end_range, is_local=False):
     return globals_dict

 def _create_config_file(conf_source, env, file_name, data):
-    env_path = conf_source / env
-    env_path.mkdir(parents=True, exist_ok=True)
-    file_path = env_path / file_name
+    env_path = conf_source / env
+    env_path.mkdir(parents=True, exist_ok=True)
+    file_path = env_path / file_name

-    import yaml
-    with open(file_path, "w") as f:
-        yaml.dump(data, f)
+    import yaml
+    with open(file_path, "w") as f:
+        yaml.dump(data, f)

 base_catalog = _generate_catalog(1, 1000, is_versioned=True)
 local_catalog = _generate_catalog(501, 1500, is_local=True)
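The _create_config_file helper writes generated YAML config into per-environment folders so the OmegaConfigLoader benchmarks have something to load. A minimal sketch of reading such generated files back, assuming kedro is installed; the conf/ layout and the "example" entry below are invented for illustration:

# Build a tiny conf tree and load it with OmegaConfigLoader, the class
# these benchmarks target.
from pathlib import Path

import yaml

from kedro.config import OmegaConfigLoader

conf_source = Path("conf")
(conf_source / "base").mkdir(parents=True, exist_ok=True)
(conf_source / "local").mkdir(parents=True, exist_ok=True)

# Write one catalog entry into the base environment.
with open(conf_source / "base" / "catalog.yml", "w") as f:
    yaml.dump({"example": {"type": "pandas.CSVDataset", "filepath": "data.csv"}}, f)

loader = OmegaConfigLoader(
    conf_source=str(conf_source), base_env="base", default_run_env="local"
)
print(loader["catalog"]["example"]["filepath"])  # -> data.csv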
