Skip to content

Commit

Permalink
Merge pull request #16 from automl/taskset-tabular
Browse files Browse the repository at this point in the history
Adding taskset-tabular
  • Loading branch information
Neeratyoy authored Jan 20, 2024
2 parents 562aef2 + edc3c44 commit 650c80f
Show file tree
Hide file tree
Showing 11 changed files with 1,565 additions and 26 deletions.
10 changes: 10 additions & 0 deletions src/mfpbench/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ def do(cls, args: argparse.Namespace) -> None:
download=True,
install=False,
force=args.force,
workers=args.workers,
)

@override
Expand All @@ -149,6 +150,15 @@ def fill_parser(cls, parser: argparse.ArgumentParser) -> argparse.ArgumentParser
action="store_true",
help="Print out the available benchmarks data sources",
)
# Number of workers; the parsed value is forwarded as `workers=args.workers`
# when downloading, and only matters for sources that support it.
parser.add_argument(
    "--workers",
    type=int,
    default=1,
    help=(
        "The number of workers to use for downloading"
        " if the downloader supports it"
    ),
)
parser.add_argument(
"--benchmark",
choices=[
Expand Down
3 changes: 3 additions & 0 deletions src/mfpbench/get.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
MFHartmann6BenchmarkModerate,
MFHartmann6BenchmarkTerrible,
)
from mfpbench.taskset_tabular import TaskSetTabularBenchmark
from mfpbench.yahpo import (
IAMLglmnetBenchmark,
IAMLrangerBenchmark,
Expand Down Expand Up @@ -85,6 +86,8 @@
"lcbench_tabular": LCBenchTabularBenchmark,
# PD1Tabular
"pd1_tabular": PD1TabularBenchmark,
# TaskSetTabular
"taskset_tabular": TaskSetTabularBenchmark,
}


Expand Down
3 changes: 3 additions & 0 deletions src/mfpbench/metric.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from dataclasses import dataclass, field

import numpy as np
import pandas as pd


class OutOfBoundsError(ValueError):
Expand Down Expand Up @@ -38,6 +39,8 @@ def as_value(self, value: float) -> Metric.Value:
Returns:
The metric value.
"""
if pd.isna(value):
value = np.inf
return Metric.Value(value=value, definition=self)

@property
Expand Down
8 changes: 4 additions & 4 deletions src/mfpbench/pd1/processing/process_script.py
Original file line number Diff line number Diff line change
Expand Up @@ -449,7 +449,7 @@ def is_large_num_steps():
if is_large_num_steps():
subsample_steps(df, path)
else:
df.set_index(["id", "epoch"], inplace=True)
df = df.set_index(["id", "epoch"])
# Save to disk
df.to_parquet(path.resolve().parent / f"{path.name.split('.csv')[0]}.parquet")

Expand All @@ -465,7 +465,7 @@ def subsample_steps(df: pd.DataFrame, path: Path) -> None:
path.resolve().parent / f"{path.name.split('.csv')[0]}-{jump_step}.parquet"
)
if jump_step == 1:
df.set_index(["id", "epoch"], inplace=True)
df = df.set_index(["id", "epoch"])
# Save to disk
df.to_parquet(target_path)
continue
Expand All @@ -484,10 +484,10 @@ def subsample_steps(df: pd.DataFrame, path: Path) -> None:
continue
drop_list = list(set(_unique_fids) - set(_retain_list))
df.loc[df["epoch"].isin(drop_list), "epoch"] = np.nan
df.dropna(inplace=True)
df = df.dropna()

# reindexing
df.set_index(["id", "epoch"], inplace=True)
df = df.set_index(["id", "epoch"])
# enumerating fidelities again
df.index = df.index.set_levels(
np.arange(1, len(df.index.get_level_values(1)) + 1, dtype=int).tolist(),
Expand Down
24 changes: 11 additions & 13 deletions src/mfpbench/pd1_tabular/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,24 +4,19 @@
from pathlib import Path
from typing import Any, ClassVar, Mapping

import numpy as np
import pandas as pd
from ConfigSpace import ConfigurationSpace
from ConfigSpace.hyperparameters import (
Constant,
UniformFloatHyperparameter,
UniformIntegerHyperparameter,
)

from mfpbench.config import TabularConfig
from mfpbench.metric import Metric
from mfpbench.result import Result
from mfpbench.setup_benchmark import PD1TabularSource # TODO
from mfpbench.tabular import TabularBenchmark

from mfpbench.pd1.benchmark import (
PD1Config, PD1ResultSimple, PD1ResultTransformer, PD1Benchmark
PD1ResultSimple,
PD1ResultTransformer,
)
from mfpbench.setup_benchmark import PD1TabularSource # TODO
from mfpbench.tabular import TabularBenchmark


def _get_raw_pd1_space(
Expand All @@ -30,7 +25,6 @@ def _get_raw_pd1_space(
*,
with_constants: bool | None = None,
) -> ConfigurationSpace:

cs = ConfigurationSpace(name=name, seed=seed)
cs.add_hyperparameters(
[
Expand Down Expand Up @@ -146,18 +140,22 @@ def __init__(
if model not in cls.models:
raise ValueError(f"Unknown task {model}, must be one of {cls.models}")
if batch_size not in cls.batch_sizes:
raise ValueError(f"Unknown task {batch_size}, must be one of {cls.batch_sizes}")
raise ValueError(
f"Unknown task {batch_size}, must be one of {cls.batch_sizes}",
)

bench_name = f"{dataset}-{model}-{batch_size}_tabular"
if bench_name in cls.coarser_step_list:
assert coarseness in [1, 2, 5, 10], "Not a recognized coarseness!"
bench_name += f"-{coarseness}"
else:
assert coarseness is None, "Not a sub-sampled benchmark. Set `coarseness=None`!"
assert (
coarseness is None
), "Not a sub-sampled benchmark. Set `coarseness=None`!"

if datadir is None:
datadir = PD1TabularSource.default_location()

table_path = Path(datadir) / f"{bench_name}.parquet"
if not table_path.exists():
raise FileNotFoundError(
Expand Down
40 changes: 34 additions & 6 deletions src/mfpbench/setup_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,23 @@ class BenchmarkSetup(ABC):
name: ClassVar[str]
"""The name of the benchmark group."""

supports_parallel: ClassVar[bool] = False
"""Whether this benchmark supports parallel downloading.
The download method will be called with a `workers` argument.
"""

@classmethod
@abstractmethod
def download(cls, path: Path) -> None:
def download(cls, path: Path, workers: int = 1) -> None:
"""Download the data from the source.
Args:
path: The root path to download to.
Will install to
path/[name][mfpbench.setup_benchmark.BenchmarkSetup.name]
workers: The number of workers to use for downloading. This
can be ignored for benchmarks that do not support parallel
downloading.
"""
...

Expand Down Expand Up @@ -97,7 +105,7 @@ class YAHPOSource(BenchmarkSetup):

@override
@classmethod
def download(cls, path: Path) -> None:
def download(cls, path: Path, workers: int = 1) -> None:
cmd = f"git clone --depth 1 --branch {cls.tag} {cls.git_url} {path}"
subprocess.run(cmd.split(), check=True) # noqa: S603

Expand Down Expand Up @@ -135,7 +143,7 @@ class PD1Source(BenchmarkSetup):

@override
@classmethod
def download(cls, path: Path) -> None:
def download(cls, path: Path, workers: int = 1) -> None:
cls._download_surrogates(path)

@classmethod
Expand Down Expand Up @@ -182,7 +190,7 @@ class LCBenchTabularSource(BenchmarkSetup):

@override
@classmethod
def download(cls, path: Path) -> None:
def download(cls, path: Path, workers: int = 1) -> None:
zippath = path / "data_2k.zip"
if not zippath.exists():
_urlopen = urllib.request.urlopen
Expand Down Expand Up @@ -263,7 +271,7 @@ class PD1TabularSource(BenchmarkSetup):

@override
@classmethod
def download(cls, path: Path) -> None:
def download(cls, path: Path, workers: int = 1) -> None:
zippath = path / "pd1.tar.gz"
if not zippath.exists():
_urlopen = urllib.request.urlopen
Expand All @@ -280,6 +288,22 @@ def _process(cls, path: Path) -> None:
process_pd1(path, process_tabular=True)


class TaskSetabularSource(BenchmarkSetup):
    """Setup source for the ``taskset-tabular`` benchmark data.

    ``download`` does no fetching of its own; it hands off directly to
    ``_process``, which calls ``process_taskset``.

    NOTE(review): the class name looks like a typo of ``TaskSetTabularSource``;
    it is left unchanged here so existing references keep working.
    """

    name = "taskset-tabular"
    # Declares that `download` makes use of its `workers` argument.
    supports_parallel = True

    @override
    @classmethod
    def download(cls, path: Path, workers: int = 1) -> None:
        """Populate ``path`` by running the processing step.

        Args:
            path: Directory passed on to the processing step as its output dir.
            workers: Number of workers passed on to the processing step.
        """
        cls._process(path, workers=workers)

    @classmethod
    def _process(cls, path: Path, workers: int = 1) -> None:
        """Call ``process_taskset`` with ``path`` as ``output_dir``.

        Args:
            path: Output directory given to ``process_taskset``.
            workers: Worker count given to ``process_taskset``.
        """
        # Imported at call time rather than at module import time.
        from mfpbench.taskset_tabular.processing.process import (
            process_taskset as _run_taskset_processing,
        )

        _run_taskset_processing(output_dir=path, workers=workers)


def download_status(source: str, datadir: Path | None = None) -> bool:
"""Check whether the data is downloaded for some source."""
datadir = datadir if datadir is not None else DATAROOT
Expand Down Expand Up @@ -360,6 +384,7 @@ def setup(
download: bool = True,
install: str | bool = False,
force: bool = False,
workers: int = 1,
) -> None:
"""Download data for a benchmark.
Expand All @@ -371,6 +396,9 @@ def setup(
If True, will install the default. If a str, tries to interpret
it as a full path.
force: Whether to force redownload of the data
workers: The number of workers to use for downloading. This
will be ignored for benchmarks that do not support parallel
setup.
"""
datadir = datadir if datadir is not None else DATAROOT

Expand All @@ -385,7 +413,7 @@ def setup(
if not source_path.exists() or next(source_path.iterdir(), None) is None:
print(f"Downloading to {source_path}")
source_path.mkdir(exist_ok=True, parents=True)
source.download(source_path)
source.download(source_path, workers=workers)
print(f"Finished downloading to {source_path}")
else:
print(f"Already found something at {source_path}")
Expand Down
6 changes: 3 additions & 3 deletions src/mfpbench/tabular.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,9 +125,9 @@ def __init__( # noqa: PLR0913, C901
table = table[relevant_cols] # type: ignore
table = table.set_index(index_cols).sort_index()
# MARK: put this back in after testing
#table.index = table.index.set_levels(
#[table.index.levels[0].astype(int), table.index.levels[1].astype(int)],
#)
# table.index = table.index.set_levels(
# [table.index.levels[0].astype(int), table.index.levels[1].astype(int)],
# )

# We now have the following table
#
Expand Down
11 changes: 11 additions & 0 deletions src/mfpbench/taskset_tabular/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from __future__ import annotations

from mfpbench.taskset_tabular.benchmark import (
TaskSetTabularBenchmark,
TaskSetTabularResult,
)

__all__ = [
"TaskSetTabularBenchmark",
"TaskSetTabularResult",
]
Loading

0 comments on commit 650c80f

Please sign in to comment.