Merge branch 'main' into feature/4175-catalog-dict-interface
ElenaKhaustova committed Oct 18, 2024
2 parents 6650a83 + 2e950a2 commit 8df7d91
Showing 3 changed files with 95 additions and 7 deletions.
8 changes: 7 additions & 1 deletion asv.conf.json
@@ -8,5 +8,11 @@
"environment_type": "virtualenv",
"show_commit_url": "http://github.com/kedro-org/kedro/commit/",
"results_dir": ".asv/results",
"html_dir": ".asv/html"
"html_dir": ".asv/html",
"matrix": {
"req": {
"kedro-datasets": [],
"pandas": []
}
}
}
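The new "matrix"/"req" block declares kedro-datasets and pandas as pip requirements that asv installs into each benchmark environment; the empty version lists appear to leave the versions unpinned. A purely illustrative sanity check (not part of this commit) that those packages resolve locally before invoking asv:

# Illustrative only: confirm the packages listed under "matrix"/"req"
# can be imported in the current environment before running the benchmarks.
import importlib

for pkg in ("kedro_datasets", "pandas"):
    module = importlib.import_module(pkg)
    print(pkg, getattr(module, "__version__", "unknown version"))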
82 changes: 82 additions & 0 deletions benchmarks/benchmark_datacatalog.py
@@ -0,0 +1,82 @@
import pandas as pd
from kedro_datasets.pandas import CSVDataset

from kedro.io import DataCatalog

base_catalog = {
    f"dataset_{i}": {
        "type": "pandas.CSVDataset",
        "filepath": f"data_{i}.csv",
    } for i in range(1, 1001)
}
# Add datasets with the same filepath for loading
base_catalog.update({
    f"dataset_load_{i}": {
        "type": "pandas.CSVDataset",
        "filepath": "data.csv",
    } for i in range(1, 1001)
})
# Add a factory pattern
base_catalog.update({
    "dataset_factory_{placeholder}": {
        "type": "pandas.CSVDataset",
        "filepath": "data_{placeholder}.csv",
    }
})


class TimeDataCatalog:
    def setup(self):
        self.catalog = DataCatalog.from_config(base_catalog)
        self.dataframe = pd.DataFrame({"column": [1, 2, 3]})
        self.dataframe.to_csv("data.csv", index=False)
        self.datasets = {
            f"dataset_new_{i}": CSVDataset(filepath="data.csv") for i in range(1, 1001)
        }
        self.feed_dict = {
            f"param_{i}": i for i in range(1, 1001)
        }

    def time_init(self):
        """Benchmark the time to initialize the catalog"""
        DataCatalog.from_config(base_catalog)

    def time_save(self):
        """Benchmark the time to save datasets"""
        for i in range(1, 1001):
            self.catalog.save(f"dataset_{i}", self.dataframe)

    def time_load(self):
        """Benchmark the time to load datasets"""
        for i in range(1, 1001):
            self.catalog.load(f"dataset_load_{i}")

    def time_exists(self):
        """Benchmark the time to check if datasets exist"""
        for i in range(1, 1001):
            self.catalog.exists(f"dataset_{i}")

    def time_release(self):
        """Benchmark the time to release datasets"""
        for i in range(1, 1001):
            self.catalog.release(f"dataset_{i}")

    def time_add_all(self):
        """Benchmark the time to add all datasets"""
        self.catalog.add_all(self.datasets)

    def time_feed_dict(self):
        """Benchmark the time to add feed dict"""
        self.catalog.add_feed_dict(self.feed_dict)

    def time_list(self):
        """Benchmark the time to list all datasets"""
        self.catalog.list()

    def time_shallow_copy(self):
        """Benchmark the time to shallow copy the catalog"""
        self.catalog.shallow_copy()

    def time_resolve_factory(self):
        """Benchmark the time to resolve factory"""
        for i in range(1, 1001):
            self.catalog._get_dataset(f"dataset_factory_{i}")
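
For context, a minimal sketch (not part of the commit) of how these benchmark methods could be exercised outside asv, assuming the file is importable as benchmarks.benchmark_datacatalog and that kedro, kedro-datasets and pandas are installed:

# Rough manual timing harness; asv normally drives setup() and the
# time_* methods itself, so this is only a local smoke test.
import timeit

from benchmarks.benchmark_datacatalog import TimeDataCatalog

bench = TimeDataCatalog()
bench.setup()

for name in ("time_init", "time_save", "time_load", "time_list"):
    seconds = timeit.timeit(getattr(bench, name), number=1)
    print(f"{name}: {seconds:.3f}s")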
12 changes: 6 additions & 6 deletions benchmarks/benchmark_ocl.py
@@ -33,13 +33,13 @@ def _generate_globals(start_range, end_range, is_local=False):
     return globals_dict

 def _create_config_file(conf_source, env, file_name, data):
-    env_path = conf_source / env
-    env_path.mkdir(parents=True, exist_ok=True)
-    file_path = env_path / file_name
+    env_path = conf_source / env
+    env_path.mkdir(parents=True, exist_ok=True)
+    file_path = env_path / file_name

-    import yaml
-    with open(file_path, "w") as f:
-        yaml.dump(data, f)
+    import yaml
+    with open(file_path, "w") as f:
+        yaml.dump(data, f)

 base_catalog = _generate_catalog(1, 1000, is_versioned=True)
 local_catalog = _generate_catalog(501, 1500, is_local=True)
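The _create_config_file helper writes generated YAML config into per-environment folders so the OmegaConfigLoader benchmarks have something to load. A minimal sketch of reading such generated files back, assuming kedro is installed; the conf/ layout and the "example" entry below are invented for illustration:

# Build a tiny conf tree and load it with OmegaConfigLoader, the class
# these benchmarks target.
from pathlib import Path

import yaml

from kedro.config import OmegaConfigLoader

conf_source = Path("conf")
(conf_source / "base").mkdir(parents=True, exist_ok=True)
(conf_source / "local").mkdir(parents=True, exist_ok=True)

# Write one catalog entry into the base environment.
with open(conf_source / "base" / "catalog.yml", "w") as f:
    yaml.dump({"example": {"type": "pandas.CSVDataset", "filepath": "data.csv"}}, f)

loader = OmegaConfigLoader(
    conf_source=str(conf_source), base_env="base", default_run_env="local"
)
print(loader["catalog"]["example"]["filepath"])  # -> data.csv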
