fix(lcbench-tabular): Correctly preprocess data

automl · Oct 31, 2023 · 5784214 · 5784214
1 parent b59cd95
commit 5784214
Show file tree

Hide file tree

Showing 2 changed files with 31 additions and 15 deletions.
diff --git a/src/mfpbench/lcbench_tabular/benchmark.py b/src/mfpbench/lcbench_tabular/benchmark.py
@@ -18,7 +18,12 @@
 from mfpbench.tabular import TabularBenchmark
 
 
-def _get_raw_lcbench_space(name: str, seed: int | None = None) -> ConfigurationSpace:
+def _get_raw_lcbench_space(
+    name: str,
+    seed: int | None = None,
+    *,
+    with_constants: bool = False,
+) -> ConfigurationSpace:
     """Get the raw configuration space for lcbench tabular.
 
     !!! note
@@ -29,6 +34,7 @@ def _get_raw_lcbench_space(name: str, seed: int | None = None) -> ConfigurationS
     Args:
         name: The name for the space.
         seed: The seed to use.
+        with_constants: Whether to include the constant hyperparameters in the space.
 
     Returns:
         The configuration space.
@@ -86,17 +92,23 @@ def _get_raw_lcbench_space(name: str, seed: int | None = None) -> ConfigurationS
                 log=False,
                 default_value=0.2,  # reasonable default
             ),
-            Constant("cosine_annealing_T_max", 50),
-            Constant("cosine_annealing_eta_min", 0.0),
-            Constant("normalization_strategy", "standardize"),
-            Constant("optimizer", "sgd"),
-            Constant("learning_rate_scheduler", "cosine_annealing"),
-            Constant("network", "shapedmlpnet"),
-            Constant("activation", "relu"),
-            Constant("mlp_shape", "funnel"),
-            Constant("imputation_strategy", "mean"),
         ],
     )
+
+    if with_constants:
+        cs.add_hyperparameters(
+            [
+                Constant("cosine_annealing_T_max", 50),
+                Constant("cosine_annealing_eta_min", 0.0),
+                Constant("normalization_strategy", "standardize"),
+                Constant("optimizer", "sgd"),
+                Constant("learning_rate_scheduler", "cosine_annealing"),
+                Constant("network", "shapedmlpnet"),
+                Constant("activation", "relu"),
+                Constant("mlp_shape", "funnel"),
+                Constant("imputation_strategy", "mean"),
+            ],
+        )
     return cs
 
 
@@ -221,7 +233,7 @@ def __init__(
         task_id: str,
         datadir: str | Path | None = None,
         *,
-        remove_constants: bool = False,
+        remove_constants: bool = True,
         seed: int | None = None,
         prior: str | Path | LCBenchTabularConfig | Mapping[str, Any] | None = None,
         perturb_prior: float | None = None,
@@ -274,12 +286,16 @@ def __init__(
         table = table.drop(index=drop_epoch, level="epoch")
 
         benchmark_task_name = f"lcbench_tabular-{task_id}"
-        space = _get_raw_lcbench_space(name=f"lcbench_tabular-{task_id}", seed=seed)
+        space = _get_raw_lcbench_space(
+            name=f"lcbench_tabular-{task_id}",
+            seed=seed,
+            with_constants=not remove_constants,
+        )
 
         super().__init__(
             table=table,  # type: ignore
             name=benchmark_task_name,
-            config_name="id",
+            config_name="config_id",
             fidelity_name=cls.fidelity_name,
             result_keys=LCBenchTabularResult.names(),
             config_keys=LCBenchTabularConfig.names(),

diff --git a/src/mfpbench/setup_benchmark.py b/src/mfpbench/setup_benchmark.py
@@ -240,7 +240,7 @@ def _process(cls, path: Path) -> None:
                 )
                 # These are single valued but this will make them as a list into
                 # the dataframe
-                df = df.assign(**{"id": config_id, **config})
+                df = df.assign(**{"config_id": config_id, **config})
 
                 config_frames_for_dataset.append(df)
 
@@ -249,7 +249,7 @@ def _process(cls, path: Path) -> None:
             df_for_dataset = (
                 pd.concat(config_frames_for_dataset, ignore_index=True)
                 .convert_dtypes()
-                .set_index(["id", "epoch"])
+                .set_index(["config_id", "epoch"])
                 .sort_index()
             )
             table_path = path / f"{dataset_name}.parquet"