diff --git a/llama_stack/distribution/registry/scorers/__init__.py b/llama_stack/distribution/registry/scorers/__init__.py
index 3332b70527..084a620a74 100644
--- a/llama_stack/distribution/registry/scorers/__init__.py
+++ b/llama_stack/distribution/registry/scorers/__init__.py
@@ -5,9 +5,19 @@
 # the root directory of this source tree.
 # TODO: make these import config based
 from llama_stack.apis.evals import *  # noqa: F403
+from llama_stack.providers.impls.meta_reference.evals.scorer.basic_scorers import *  # noqa: F403
 
 from ..registry import Registry
 
 
 class ScorerRegistry(Registry[BaseScorer]):
     _REGISTRY: Dict[str, BaseScorer] = {}
+
+
+SCORER_REGISTRY = {
+    "accuracy": AccuracyScorer,
+    "random": RandomScorer,
+}
+
+for k, v in SCORER_REGISTRY.items():
+    ScorerRegistry.register(k, v)
diff --git a/llama_stack/providers/impls/meta_reference/evals/evals.py b/llama_stack/providers/impls/meta_reference/evals/evals.py
index 3ae988cbdc..1d703a27ce 100644
--- a/llama_stack/providers/impls/meta_reference/evals/evals.py
+++ b/llama_stack/providers/impls/meta_reference/evals/evals.py
@@ -53,6 +53,7 @@ async def run_eval_task(
             scoring_config=EvaluateScoringConfig(
                 scorer_config_list=[
                     EvaluateSingleScorerConfig(scorer_name="accuracy"),
+                    EvaluateSingleScorerConfig(scorer_name="random"),
                 ]
             ),
         )
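With the registration loop above, any `BaseScorer` subclass becomes addressable by name from an eval config. A minimal sketch of how an additional scorer could be plugged in, assuming only the `BaseScorer`/`ScorerRegistry` interfaces shown in this diff — `ExactMatchScorer` and the `ScorerInputSample` field names it reads are hypothetical, not part of this PR:

```python
# Hypothetical example, not part of this PR. Assumes the BaseScorer and
# ScorerRegistry interfaces shown in the diff above; ScorerInputSample's
# field names are illustrative guesses, since its schema is not shown here.
from llama_stack.apis.evals import *  # noqa: F403
from llama_stack.distribution.registry.scorers import ScorerRegistry


class ExactMatchScorer(BaseScorer[ScorerInputSample]):
    def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult:
        # Assumed fields -- the diff does not expose ScorerInputSample's schema.
        hit = scorer_input_sample.generated_answer == scorer_input_sample.expected_answer
        return SingleEvalResult(score_data={"exact_match": 1.0 if hit else 0.0})

    def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult:
        scores = [r.score_data["exact_match"] for r in eval_results]
        return EvalResult(metrics={"exact_match": sum(scores) / max(len(scores), 1)})


# Once registered, EvaluateSingleScorerConfig(scorer_name="exact_match")
# resolves to this class through ScorerRegistry.get("exact_match").
ScorerRegistry.register("exact_match", ExactMatchScorer)
```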
diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/aggregate_scorer.py b/llama_stack/providers/impls/meta_reference/evals/scorer/aggregate_scorer.py
new file mode 100644
index 0000000000..1a0621960e
--- /dev/null
+++ b/llama_stack/providers/impls/meta_reference/evals/scorer/aggregate_scorer.py
@@ -0,0 +1,35 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from llama_stack.apis.evals.evals import BaseScorer, EvalResult, SingleEvalResult
+from llama_stack.apis.datasets.datasets import *  # noqa: F401 F403
+
+
+class AggregateScorer(BaseScorer[ScorerInputSample]):
+    def __init__(self, scorers: List[BaseScorer[ScorerInputSample]]):
+        self.scorers = scorers
+
+    def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult:
+        all_score_data = {}
+        for scorer in self.scorers:
+            score_data = scorer.score_sample(scorer_input_sample).score_data
+            for k, v in score_data.items():
+                all_score_data[k] = v
+
+        return SingleEvalResult(
+            score_data=all_score_data,
+        )
+
+    def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult:
+        all_metrics = {}
+
+        for scorer in self.scorers:
+            metrics = scorer.aggregate_results(eval_results).metrics
+            for k, v in metrics.items():
+                all_metrics[f"{scorer.__class__.__name__}:{k}"] = v
+
+        return EvalResult(
+            metrics=all_metrics,
+        )
diff --git a/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py b/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py
index 47d41c6d61..48d8caa3fa 100644
--- a/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py
+++ b/llama_stack/providers/impls/meta_reference/evals/scorer/basic_scorers.py
@@ -9,34 +9,6 @@
 from llama_stack.apis.datasets.datasets import *  # noqa: F401 F403
 
 
-class AggregateScorer(BaseScorer[ScorerInputSample]):
-    def __init__(self, scorers: List[BaseScorer[ScorerInputSample]]):
-        self.scorers = scorers
-
-    def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult:
-        all_score_data = {}
-        for scorer in self.scorers:
-            score_data = scorer.score_sample(scorer_input_sample).score_data
-            for k, v in score_data.items():
-                all_score_data[k] = v
-
-        return SingleEvalResult(
-            score_data=all_score_data,
-        )
-
-    def aggregate_results(self, eval_results: List[SingleEvalResult]) -> EvalResult:
-        all_metrics = {}
-
-        for scorer in self.scorers:
-            metrics = scorer.aggregate_results(eval_results).metrics
-            for k, v in metrics.items():
-                all_metrics[f"{scorer.__class__.__name__}:{k}"] = v
-
-        return EvalResult(
-            metrics=all_metrics,
-        )
-
-
 class RandomScorer(BaseScorer[ScorerInputSample]):
     def score_sample(self, scorer_input_sample: ScorerInputSample) -> SingleEvalResult:
         return SingleEvalResult(score_data={"random": random.random()})
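Moving `AggregateScorer` out of `basic_scorers.py` into its own module separates the fan-out/merge logic from the individual scorers. Its merge semantics, sketched with the scorer classes from this diff — the metric key names in the comments are illustrative, since only `RandomScorer`'s `"random"` key is confirmed above:

```python
# Illustrative sketch of AggregateScorer's behavior, not part of this PR.
scorer = AggregateScorer(scorers=[AccuracyScorer(), RandomScorer()])

# score_sample() merges every child's score_data into one flat dict, e.g.
#   {"accuracy": 1.0, "random": 0.42}   # actual keys depend on each child
# On key collisions, later children overwrite earlier ones, so child scorers
# should use distinct score_data keys.

# aggregate_results() prefixes each metric with the child's class name, e.g.
#   {"AccuracyScorer:avg_accuracy": 0.9, "RandomScorer:avg_random": 0.5}
# Note the prefix is the class name only: two instances of the same scorer
# class would overwrite each other's metrics.
```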
diff --git a/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py
index fde2efdb08..48c4509141 100644
--- a/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py
+++ b/llama_stack/providers/impls/meta_reference/evals/tasks/run_eval_task.py
@@ -4,6 +4,8 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 from llama_stack.distribution.registry.datasets import DatasetRegistry
+from llama_stack.distribution.registry.scorers import ScorerRegistry
+from llama_stack.providers.impls.meta_reference.evals.scorer.aggregate_scorer import *  # noqa: F403
 from llama_stack.providers.impls.meta_reference.evals.scorer.basic_scorers import *  # noqa: F403
 from llama_stack.providers.impls.meta_reference.evals.generator.inference_generator import (
     InferenceGenerator,
@@ -59,11 +61,14 @@ async def run(
         cprint(postprocessed, "blue")
 
         # F3 - scorer
+        scorer_config_list = eval_task_config.scoring_config.scorer_config_list
+        scorer_list = []
+        for s_conf in scorer_config_list:
+            scorer = ScorerRegistry.get(s_conf.scorer_name)
+            scorer_list.append(scorer())
+
         scorer = AggregateScorer(
-            scorers=[
-                AccuracyScorer(),
-                RandomScorer(),
-            ]
+            scorers=scorer_list,
         )
 
         scorer_results = scorer.score(postprocessed)
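This loop is what makes the scorer set config-driven: names from `scorer_config_list` are resolved through `ScorerRegistry` and instantiated, rather than being hard-coded. A condensed equivalent, assuming `ScorerRegistry.get` returns the registered class as the registration code above implies (how it handles unknown names is not shown in this diff):

```python
# Condensed equivalent of the lookup loop above -- illustrative only.
scorer = AggregateScorer(
    scorers=[
        # ScorerRegistry.get returns the registered class; call it to instantiate.
        ScorerRegistry.get(s_conf.scorer_name)()
        for s_conf in eval_task_config.scoring_config.scorer_config_list
    ]
)
scorer_results = scorer.score(postprocessed)
```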