Merge pull request #16 from automl/taskset-tabular

Adding taskset-tabular
automl · Jan 20, 2024 · 650c80f · 650c80f
2 parents 562aef2 + edc3c44
commit 650c80f
Show file tree

Hide file tree

Showing 11 changed files with 1,565 additions and 26 deletions.
diff --git a/src/mfpbench/__main__.py b/src/mfpbench/__main__.py
@@ -129,6 +129,7 @@ def do(cls, args: argparse.Namespace) -> None:
             download=True,
             install=False,
             force=args.force,
+            workers=args.workers,
         )
 
     @override
@@ -149,6 +150,15 @@ def fill_parser(cls, parser: argparse.ArgumentParser) -> argparse.ArgumentParser
             action="store_true",
             help="Print out the available benchmarks data sources",
         )
+        # `do()` forwards this value as the `workers` keyword argument.
+        parser.add_argument(
+            "--workers",
+            type=int,
+            default=1,
+            help=(
+                "The number of workers to use for downloading"
+                " if the downloader supports it"
+            ),
+        )
         parser.add_argument(
             "--benchmark",
             choices=[

diff --git a/src/mfpbench/get.py b/src/mfpbench/get.py
@@ -25,6 +25,7 @@
     MFHartmann6BenchmarkModerate,
     MFHartmann6BenchmarkTerrible,
 )
+from mfpbench.taskset_tabular import TaskSetTabularBenchmark
 from mfpbench.yahpo import (
     IAMLglmnetBenchmark,
     IAMLrangerBenchmark,
@@ -85,6 +86,8 @@
     "lcbench_tabular": LCBenchTabularBenchmark,
     # PD1Tabular
     "pd1_tabular": PD1TabularBenchmark,
+    # TaskSetTabular
+    "taskset_tabular": TaskSetTabularBenchmark,
 }
 
 

diff --git a/src/mfpbench/metric.py b/src/mfpbench/metric.py
@@ -3,6 +3,7 @@
 from dataclasses import dataclass, field
 
 import numpy as np
+import pandas as pd
 
 
 class OutOfBoundsError(ValueError):
@@ -38,6 +39,8 @@ def as_value(self, value: float) -> Metric.Value:
         Returns:
             The metric value.
         """
+        if pd.isna(value):
+            value = np.inf
         return Metric.Value(value=value, definition=self)
 
     @property

diff --git a/src/mfpbench/pd1/processing/process_script.py b/src/mfpbench/pd1/processing/process_script.py
@@ -449,7 +449,7 @@ def is_large_num_steps():
     if is_large_num_steps():
         subsample_steps(df, path)
     else:
-        df.set_index(["id", "epoch"], inplace=True)
+        df = df.set_index(["id", "epoch"])
         # Save to disk
         df.to_parquet(path.resolve().parent / f"{path.name.split('.csv')[0]}.parquet")
 
@@ -465,7 +465,7 @@ def subsample_steps(df: pd.DataFrame, path: Path) -> None:
             path.resolve().parent / f"{path.name.split('.csv')[0]}-{jump_step}.parquet"
         )
         if jump_step == 1:
-            df.set_index(["id", "epoch"], inplace=True)
+            df = df.set_index(["id", "epoch"])
             # Save to disk
             df.to_parquet(target_path)
             continue
@@ -484,10 +484,10 @@ def subsample_steps(df: pd.DataFrame, path: Path) -> None:
             continue
         drop_list = list(set(_unique_fids) - set(_retain_list))
         df.loc[df["epoch"].isin(drop_list), "epoch"] = np.nan
-        df.dropna(inplace=True)
+        df = df.dropna()
 
         # reindexing
-        df.set_index(["id", "epoch"], inplace=True)
+        df = df.set_index(["id", "epoch"])
         # enumerating fidelities again
         df.index = df.index.set_levels(
             np.arange(1, len(df.index.get_level_values(1)) + 1, dtype=int).tolist(),

diff --git a/src/mfpbench/pd1_tabular/benchmark.py b/src/mfpbench/pd1_tabular/benchmark.py
@@ -4,24 +4,19 @@
 from pathlib import Path
 from typing import Any, ClassVar, Mapping
 
-import numpy as np
 import pandas as pd
 from ConfigSpace import ConfigurationSpace
 from ConfigSpace.hyperparameters import (
-    Constant,
     UniformFloatHyperparameter,
-    UniformIntegerHyperparameter,
 )
 
 from mfpbench.config import TabularConfig
-from mfpbench.metric import Metric
-from mfpbench.result import Result
-from mfpbench.setup_benchmark import PD1TabularSource  # TODO
-from mfpbench.tabular import TabularBenchmark
-
 from mfpbench.pd1.benchmark import (
-    PD1Config, PD1ResultSimple, PD1ResultTransformer, PD1Benchmark
+    PD1ResultSimple,
+    PD1ResultTransformer,
 )
+from mfpbench.setup_benchmark import PD1TabularSource  # TODO
+from mfpbench.tabular import TabularBenchmark
 
 
 def _get_raw_pd1_space(
@@ -30,7 +25,6 @@ def _get_raw_pd1_space(
     *,
     with_constants: bool | None = None,
 ) -> ConfigurationSpace:
-
     cs = ConfigurationSpace(name=name, seed=seed)
     cs.add_hyperparameters(
         [
@@ -146,18 +140,22 @@ def __init__(
         if model not in cls.models:
             raise ValueError(f"Unknown task {model}, must be one of {cls.models}")
         if batch_size not in cls.batch_sizes:
-            raise ValueError(f"Unknown task {batch_size}, must be one of {cls.batch_sizes}")
+            raise ValueError(
+                f"Unknown task {batch_size}, must be one of {cls.batch_sizes}",
+            )
 
         bench_name = f"{dataset}-{model}-{batch_size}_tabular"
         if bench_name in cls.coarser_step_list:
             assert coarseness in [1, 2, 5, 10], "Not a recognized coarseness!"
             bench_name += f"-{coarseness}"
         else:
-            assert coarseness is None, "Not a sub-sampled benchmark. Set `coarseness=None`!"
+            assert (
+                coarseness is None
+            ), "Not a sub-sampled benchmark. Set `coarseness=None`!"
 
         if datadir is None:
             datadir = PD1TabularSource.default_location()
-        
+
         table_path = Path(datadir) / f"{bench_name}.parquet"
         if not table_path.exists():
             raise FileNotFoundError(

diff --git a/src/mfpbench/setup_benchmark.py b/src/mfpbench/setup_benchmark.py
@@ -29,15 +29,23 @@ class BenchmarkSetup(ABC):
     name: ClassVar[str]
     """The name of the benchmark group."""
 
+    supports_parallel: ClassVar[bool] = False
+    """Whether this benchmark supports parallel downloading.
+
+    The download method will be called with a `workers` argument.
+    """
+
     @classmethod
     @abstractmethod
-    def download(cls, path: Path) -> None:
+    def download(cls, path: Path, workers: int = 1) -> None:
         """Download the data from the source.
 
         Args:
             path: The root path to download to.
                 Will install to
                 path/[name][mfpbench.setup_benchmark.BenchmarkSetup.name]
+            workers: The number of workers to use for downloading. This
+                can be ignored for benchmarks that do not support parallel.
         """
         ...
 
@@ -97,7 +105,7 @@ class YAHPOSource(BenchmarkSetup):
 
     @override
     @classmethod
-    def download(cls, path: Path) -> None:
+    def download(cls, path: Path, workers: int = 1) -> None:
         cmd = f"git clone --depth 1 --branch {cls.tag} {cls.git_url} {path}"
         subprocess.run(cmd.split(), check=True)  # noqa: S603
 
@@ -135,7 +143,7 @@ class PD1Source(BenchmarkSetup):
 
     @override
     @classmethod
-    def download(cls, path: Path) -> None:
+    def download(cls, path: Path, workers: int = 1) -> None:
         cls._download_surrogates(path)
 
     @classmethod
@@ -182,7 +190,7 @@ class LCBenchTabularSource(BenchmarkSetup):
 
     @override
     @classmethod
-    def download(cls, path: Path) -> None:
+    def download(cls, path: Path, workers: int = 1) -> None:
         zippath = path / "data_2k.zip"
         if not zippath.exists():
             _urlopen = urllib.request.urlopen
@@ -263,7 +271,7 @@ class PD1TabularSource(BenchmarkSetup):
 
     @override
     @classmethod
-    def download(cls, path: Path) -> None:
+    def download(cls, path: Path, workers: int = 1) -> None:
         zippath = path / "pd1.tar.gz"
         if not zippath.exists():
             _urlopen = urllib.request.urlopen
@@ -280,6 +288,22 @@ def _process(cls, path: Path) -> None:
         process_pd1(path, process_tabular=True)
 
 
+class TaskSetTabularSource(BenchmarkSetup):
+    """Setup source for the `taskset-tabular` benchmark data.
+
+    Declares `supports_parallel = True`, so the `workers` argument given to
+    `download` is forwarded to the processing routine instead of being ignored.
+    """
+
+    name = "taskset-tabular"
+    supports_parallel = True
+
+    @override
+    @classmethod
+    def download(cls, path: Path, workers: int = 1) -> None:
+        """Obtain the taskset data by delegating to `_process`.
+
+        Args:
+            path: The directory the data should be written to.
+            workers: The number of workers handed on to the processing step.
+        """
+        cls._process(path, workers=workers)
+
+    @classmethod
+    def _process(cls, path: Path, workers: int = 1) -> None:
+        """Run `process_taskset` with `path` as its output directory.
+
+        Args:
+            path: Passed to `process_taskset` as `output_dir`.
+            workers: Passed to `process_taskset` as `workers`.
+        """
+        # Imported at call time rather than at module import time.
+        # NOTE(review): presumably to keep the processing dependencies
+        # optional for users who never set up this source -- confirm.
+        from mfpbench.taskset_tabular.processing.process import process_taskset
+
+        process_taskset(output_dir=path, workers=workers)
+
+
+# Backward-compatible alias: the class was first introduced under this
+# misspelled name (missing "T"), so keep it importable.
+TaskSetabularSource = TaskSetTabularSource
+
+
 def download_status(source: str, datadir: Path | None = None) -> bool:
     """Check whether the data is downloaded for some source."""
     datadir = datadir if datadir is not None else DATAROOT
@@ -360,6 +384,7 @@ def setup(
     download: bool = True,
     install: str | bool = False,
     force: bool = False,
+    workers: int = 1,
 ) -> None:
     """Download data for a benchmark.
 
@@ -371,6 +396,9 @@ def setup(
             If True, will install the default. If a str, tries to interpret
             it as a full path.
         force: Whether to force redownload of the data
+        workers: The number of workers to use for downloading. This
+            will be ignored for benchmarks that do not support parallel
+            setup.
     """
     datadir = datadir if datadir is not None else DATAROOT
 
@@ -385,7 +413,7 @@ def setup(
         if not source_path.exists() or next(source_path.iterdir(), None) is None:
             print(f"Downloading to {source_path}")
             source_path.mkdir(exist_ok=True, parents=True)
-            source.download(source_path)
+            source.download(source_path, workers=workers)
             print(f"Finished downloading to {source_path}")
         else:
             print(f"Already found something at {source_path}")

diff --git a/src/mfpbench/tabular.py b/src/mfpbench/tabular.py
@@ -125,9 +125,9 @@ def __init__(  # noqa: PLR0913, C901
         table = table[relevant_cols]  # type: ignore
         table = table.set_index(index_cols).sort_index()
         # MARK: put this back in after testing
-        #table.index = table.index.set_levels(
-            #[table.index.levels[0].astype(int), table.index.levels[1].astype(int)],
-        #)
+        # table.index = table.index.set_levels(
+        # [table.index.levels[0].astype(int), table.index.levels[1].astype(int)],
+        # )
 
         # We now have the following table
         #

diff --git a/src/mfpbench/taskset_tabular/__init__.py b/src/mfpbench/taskset_tabular/__init__.py
@@ -0,0 +1,11 @@
+from __future__ import annotations
+
+from mfpbench.taskset_tabular.benchmark import (
+    TaskSetTabularBenchmark,
+    TaskSetTabularResult,
+)
+
+__all__ = [
+    "TaskSetTabularBenchmark",
+    "TaskSetTabularResult",
+]