New arg for quasi-exact match (#3257)
teetone authored Jan 8, 2025
1 parent 1c8d873 commit 49cd8ef
Showing 4 changed files with 34 additions and 28 deletions.
19 changes: 17 additions & 2 deletions src/helm/benchmark/metrics/evaluate_reference_metrics.py
@@ -39,7 +39,7 @@ def pass_at_k_estimator(n: int, c: int, k: int) -> float:
     return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))


-def normalize_text(text: str) -> str:
+def normalize_text(text: str, should_remove_articles: bool = True) -> str:
     """Lower text and remove punctuation, articles and extra whitespace.
     Copied from the [QuAC](http://quac.ai/) evaluation script found at
     https://s3.amazonaws.com/my89public/quac/scorer.py"""
@@ -57,7 +57,10 @@ def remove_punc(text: str) -> str:
     def lower(text: str) -> str:
         return text.lower()

-    return white_space_fix(remove_articles(remove_punc(lower(text))))
+    normalized_text = remove_punc(lower(text))
+    if should_remove_articles:
+        normalized_text = remove_articles(normalized_text)
+    return white_space_fix(normalized_text)


 def exact_match(gold: str, pred: str) -> float:
@@ -74,6 +77,17 @@ def quasi_exact_match(gold: str, pred: str) -> float:
     return 1 if normalize_text(gold) == normalize_text(pred) else 0


+def quasi_leave_articles_exact_match(gold: str, pred: str) -> float:
+    if not pred:
+        return 0
+
+    return (
+        1
+        if normalize_text(gold, should_remove_articles=False) == normalize_text(pred, should_remove_articles=False)
+        else 0
+    )
+
+
 def prefix_exact_match(gold: str, pred: str) -> float:
     """
     The `prefix_exact_match` metric is particularly useful in the zero-shot setting, where the model is
@@ -423,6 +437,7 @@ def compute_metrics_helper(
     metric_fn_mapping: Dict[str, Callable] = {
         "exact_match": exact_match,
         "quasi_exact_match": quasi_exact_match,
+        "quasi_leave_articles_exact_match": quasi_leave_articles_exact_match,
         "prefix_exact_match": prefix_exact_match,
         "quasi_prefix_exact_match": quasi_prefix_exact_match,
         "exact_match_indicator": exact_match_indicator,
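Note: the behavioral change above is easiest to see end to end. Below is a self-contained sketch of the normalization path; the white_space_fix, remove_articles, and remove_punc helpers are elided in this diff, so the versions here are reconstructed from the QuAC scorer cited in the docstring and should be treated as approximations, not the exact HELM code.

import re
import string


def normalize_text(text: str, should_remove_articles: bool = True) -> str:
    # Nested helpers approximate the QuAC scorer's normalization steps.
    def white_space_fix(t: str) -> str:
        return " ".join(t.split())

    def remove_articles(t: str) -> str:
        return re.sub(r"\b(a|an|the)\b", " ", t)

    def remove_punc(t: str) -> str:
        return "".join(ch for ch in t if ch not in set(string.punctuation))

    normalized_text = remove_punc(text.lower())
    if should_remove_articles:
        normalized_text = remove_articles(normalized_text)
    return white_space_fix(normalized_text)


gold, pred = "The Eiffel Tower", "Eiffel Tower!"

# Articles stripped (default): both sides normalize to "eiffel tower",
# so quasi_exact_match scores 1.
print(normalize_text(gold) == normalize_text(pred))  # True

# Articles kept: "the eiffel tower" != "eiffel tower", so the new
# quasi_leave_articles_exact_match scores 0 on the same pair.
print(
    normalize_text(gold, should_remove_articles=False)
    == normalize_text(pred, should_remove_articles=False)
)  # False

In other words, the new metric keeps the rest of the normalization (case, punctuation, whitespace) but stops forgiving missing or extra articles.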
25 changes: 1 addition & 24 deletions src/helm/benchmark/presentation/run_entries_vhelm_debug.conf
@@ -1,26 +1,3 @@
 entries: [
-
-  {description: "mm_star:category=coarse_perception,model=vlm", priority: 1, groups: ["mm_star_perception"]}
-  {description: "mm_star:category=fine-grained_perception,model=vlm", priority: 1, groups: ["mm_star_perception"]}
-  {description: "mm_star:category=instance_reasoning,model=vlm", priority: 1, groups: ["mm_star_reasoning"]}
-  {description: "mm_star:category=logical_reasoning,model=vlm", priority: 1, groups: ["mm_star_reasoning"]}
-  {description: "mm_star:category=math,model=vlm", priority: 1, groups: ["mm_star_reasoning"]}
-  {description: "mm_star:category=science_technology,model=vlm", priority: 1, groups: ["mm_star_knowledge"]}
-
-  {description: "blink:category=Art_Style,model=vlm", priority: 1, groups: ["blink_perception"]}
-  {description: "blink:category=Counting,model=vlm", priority: 1, groups: ["blink_perception"]}
-  {description: "blink:category=Object_Localization,model=vlm", priority: 1, groups: ["blink_perception"]}
-  {description: "blink:category=Relative_Depth,model=vlm", priority: 1, groups: ["blink_perception"]}
-  {description: "blink:category=Relative_Reflectance,model=vlm", priority: 1, groups: ["blink_perception"]}
-  {description: "blink:category=Semantic_Correspondence,model=vlm", priority: 1, groups: ["blink_perception"]}
-  {description: "blink:category=Spatial_Relation,model=vlm", priority: 1, groups: ["blink_perception"]}
-  {description: "blink:category=Visual_Correspondence,model=vlm", priority: 1, groups: ["blink_perception"]}
-  {description: "blink:category=Visual_Similarity,model=vlm", priority: 1, groups: ["blink_perception"]}
-
-  {description: "blink:category=Functional_Correspondence,model=vlm", priority: 1, groups: ["blink_knowledge"]}
-  {description: "blink:category=Forensic_Detection,model=vlm", priority: 1, groups: ["blink_knowledge"]}
-
-  {description: "blink:category=IQ_Test,model=vlm", priority: 1, groups: ["blink_reasoning"]}
-  {description: "blink:category=Jigsaw,model=vlm", priority: 1, groups: ["blink_reasoning"]}
-  {description: "blink:category=Multi-view_Reasoning,model=vlm", priority: 1, groups: ["blink_reasoning"]}
+  {description: "real_world_qa:model=vlm", priority: 1}
 ]
11 changes: 10 additions & 1 deletion src/helm/benchmark/run_specs/vlm_run_specs.py
@@ -107,7 +107,16 @@ def _get_multiple_choice_joint_adapter_spec(

 def _get_open_ended_generation_metric_specs() -> List[MetricSpec]:
     return get_basic_metric_specs(
-        ["exact_match", "quasi_exact_match", "f1_score", "rouge_l", "bleu_1", "bleu_4", "cider"]
+        [
+            "exact_match",
+            "quasi_exact_match",
+            "quasi_leave_articles_exact_match",
+            "f1_score",
+            "rouge_l",
+            "bleu_1",
+            "bleu_4",
+            "cider",
+        ]
     )


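Note: each name in this list is resolved at evaluation time through the metric_fn_mapping dict extended in the first file of this commit. A minimal sketch of that name-to-function dispatch, with toy metric functions standing in for the real ones (the score helper below is hypothetical; the actual lookup lives in compute_metrics_helper):

from typing import Callable, Dict


def exact_match(gold: str, pred: str) -> float:
    # Toy stand-in for the real exact_match in evaluate_reference_metrics.py.
    return 1.0 if gold == pred else 0.0


def quasi_exact_match(gold: str, pred: str) -> float:
    # Toy stand-in: the real version uses normalize_text as shown above.
    return 1.0 if gold.strip().lower() == pred.strip().lower() else 0.0


metric_fn_mapping: Dict[str, Callable[[str, str], float]] = {
    "exact_match": exact_match,
    "quasi_exact_match": quasi_exact_match,
    # ... "quasi_leave_articles_exact_match" and the rest register here.
}


def score(metric_name: str, gold: str, pred: str) -> float:
    # Unknown metric names fail fast with a KeyError, mirroring the
    # dict lookup inside compute_metrics_helper.
    return metric_fn_mapping[metric_name](gold, pred)


print(score("quasi_exact_match", "Paris", " paris "))  # 1.0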
7 changes: 6 additions & 1 deletion src/helm/benchmark/static/schema_vhelm.yaml
@@ -122,6 +122,11 @@ metrics:
     short_display_name: EM
     description: Fraction of instances that the predicted output matches a correct reference up to light processing.
     lower_is_better: false
+  - name: quasi_leave_articles_exact_match
+    display_name: Quasi-exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference up to light processing.
+    lower_is_better: false
   - name: prefix_exact_match
     display_name: Prefix exact match
     short_display_name: PEM
@@ -902,7 +907,7 @@
       - accuracy
       - general_information
     environment:
-      main_name: quasi_prefix_exact_match
+      main_name: quasi_leave_articles_exact_match
       main_split: test
     taxonomy:
       task: short-answer question answering