Skip to content

Commit

Permalink
Add VoxCeleb2Scenario for audio identification (#3179)
Browse files Browse the repository at this point in the history
Co-authored-by: ImKeTT <[email protected]>
  • Loading branch information
LAOS-Y and ImKeTT authored Nov 23, 2024
1 parent aa3b31a commit 47dedc8
Show file tree
Hide file tree
Showing 6 changed files with 168 additions and 8 deletions.
1 change: 1 addition & 0 deletions src/helm/benchmark/presentation/run_entries_speech.conf
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ entries: [
{description: "meld_audio:model=audiolm", priority: 1}
{description: "vocal_sound:model=audiolm", priority: 1}
{description: "audiocaps:model=audiolm", priority: 1}
{description: "voxceleb2:model=audiolm", priority: 1}

####################################################################################################################
# Fairness
Expand Down
19 changes: 19 additions & 0 deletions src/helm/benchmark/run_specs/audio_run_specs.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,25 @@ def get_audiocaps_run_spec() -> RunSpec:
)


@run_spec_function("voxceleb2")
def get_voxceleb2_run_spec() -> RunSpec:
    """Assemble the run spec for the VoxCeleb2 scenario.

    The scenario is adapted with the multiple-choice joint adapter using no
    in-context training instances (``max_train_instances=0``) and is scored
    with the metric specs returned by ``get_exact_match_metric_specs``. The
    run spec name doubles as its only group.
    """
    name: str = "voxceleb2"
    # Keyword arguments are evaluated in the order written, so the specs are
    # still built scenario -> adapter -> metrics before RunSpec is constructed.
    return RunSpec(
        name=name,
        scenario_spec=ScenarioSpec(
            class_name="helm.benchmark.scenarios.audio_language.voxceleb2_scenario.VoxCeleb2Scenario"
        ),
        adapter_spec=_get_multiple_choice_joint_adapter_spec(
            input_noun=None, output_noun="Answer", max_train_instances=0
        ),
        metric_specs=get_exact_match_metric_specs(),
        groups=[name],
    )


@run_spec_function("common_voice_15")
def get_common_voice_15_run_spec(language: str) -> RunSpec:
scenario_spec = ScenarioSpec(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
)
from helm.common.media_object import MediaObject, MultimediaObject
from helm.common.general import ensure_file_downloaded, ensure_directory_exists
from helm.common.audio_utils import use_ffmpeg_to_extract_audio_from_video


class CasualConversations2Scenario(Scenario):
Expand Down Expand Up @@ -72,12 +73,6 @@ def __init__(self, subject: str) -> None:
self.options = self.age_options if subject == "age" else self.gender_options
self.instruction = self.AGE_INSTRUCTION if subject == "age" else self.GENDER_INSTRUCTION

def _extract_audio_from_video(self, input_video_path: str, output_audio_path: str) -> None:
try:
os.system(f"ffmpeg -i {input_video_path} -q:a 0 -map a {output_audio_path}")
except Exception:
raise ValueError("Please install ffmpeg using `bash install-shelm-extras.sh` first to extract audio files.")

def _convert_age_to_label(self, age: str) -> str:
if age != "prefer not to say":
age_int = int(age)
Expand Down Expand Up @@ -128,8 +123,7 @@ def get_instances(self, output_path: str) -> List[Instance]:
if file_name.endswith(".mp4"):
local_audio_path: str = os.path.join(audio_file_folder, file_name.replace(".mp4", ".mp3"))
local_video_path: str = os.path.join(data_dir, file_name)
if not os.path.exists(local_audio_path):
self._extract_audio_from_video(local_video_path, local_audio_path)
use_ffmpeg_to_extract_audio_from_video(local_video_path, local_audio_path)
assert os.path.exists(local_audio_path), f"Audio file does not exist at path: {local_audio_path}"

subject_answer = audio_scripts[file_name][self._subject]
Expand Down
105 changes: 105 additions & 0 deletions src/helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
from typing import List
import os

from helm.benchmark.scenarios.scenario import (
Scenario,
Instance,
Reference,
TEST_SPLIT,
CORRECT_TAG,
Input,
Output,
)
from tqdm import tqdm
from helm.common.media_object import MediaObject, MultimediaObject
from helm.common.general import ensure_file_downloaded, ensure_directory_exists
from helm.common.audio_utils import use_ffmpeg_to_convert_audio_file
import pandas as pd


class VoxCeleb2Scenario(Scenario):
    """VoxCeleb2

    VoxCeleb2 is an audio-visual dataset consisting of short clips of human speech, extracted from
    interview videos uploaded to YouTube. This dataset contains over a million utterances from over
    6,000 speakers.

    Each instance produced by this scenario holds two audio clips followed by a text instruction
    asking whether the two speakers are the same person; the references are the options
    "Yes" and "No", with the correct one tagged.

    Paper: https://www.robots.ox.ac.uk/~vgg/publications/2018/Chung18a/chung18a.pdf

    Citation:
    @inproceedings{Chung18b,
        author = "Chung, J.~S. and Nagrani, A. and Zisserman, A.",
        title = "VoxCeleb2: Deep Speaker Recognition",
        booktitle = "INTERSPEECH",
        year = "2018",
    }
    """

    # Archive with the test-split audio clips (.m4a files).
    DOWNLOADING_URL = "https://huggingface.co/datasets/ProgramComputer/voxceleb/resolve/main/vox2/vox2_test_aac.zip"
    # CSV of clip pairs; read below through its "first", "second" and "same" columns.
    REFERENCE_URL = (
        "https://huggingface.co/datasets/LAOS-Y/VoxCeleb2-AudioIdentity/resolve/main/voxceleb2_audioidentity.csv"
    )
    IDENTITY_INSTRUCTION = (
        "Listen to the audio and take your best guess to determine if the two speakers are the same person."
    )

    name = "voxceleb2"
    # Adjacent string literals are concatenated verbatim, so each piece carries its own
    # trailing space; without it the text renders as "theirgender" / "information([Chung".
    description = (
        "A large-scale dataset of over a million utterances from over 6,000 speakers with their "
        "gender, race, identity information "
        "([Chung et al, 2018](https://www.robots.ox.ac.uk/~vgg/publications/2018/Chung18a/chung18a.pdf))."
    )
    tags: List[str] = ["audio", "identification"]
    # Order matters: index 0 corresponds to label "A" and index 1 to label "B".
    options: List[str] = ["Yes", "No"]

    def _convert_answer_to_label(self, answer: bool) -> str:
        """Return the option letter for an annotation: "A" (Yes) if truthy, otherwise "B" (No)."""
        return "A" if answer else "B"

    def _reformat_and_convert_audio_file(
        self, ori_file_path: str, tgt_audio_data_path: str, audio_data_path: str
    ) -> str:
        """Convert one source clip to WAV and return the path of the converted file.

        Args:
            ori_file_path: Path of the source clip relative to ``audio_data_path``
                (presumably ending in ".m4a"; everything from the first ".m4a" on is
                replaced with ".wav").
            tgt_audio_data_path: Root directory under which the WAV file is written,
                mirroring the relative path of the source clip.
            audio_data_path: Root directory containing the source clips.

        Returns:
            The path of the WAV file.
        """
        tgt_audio_path = os.path.join(tgt_audio_data_path, ori_file_path.split(".m4a")[0] + ".wav")
        # The converted file keeps the nested layout of the source, so create its parent first.
        ensure_directory_exists(os.path.dirname(tgt_audio_path))
        use_ffmpeg_to_convert_audio_file(os.path.join(audio_data_path, ori_file_path), tgt_audio_path)
        return tgt_audio_path

    def get_instances(self, output_path: str) -> List[Instance]:
        """Download the data, convert every referenced clip to WAV and build the test instances.

        Args:
            output_path: Directory used for the downloaded audio ("audio_files") and
                the converted WAV files ("tgt_audio_files").

        Returns:
            One ``Instance`` per annotation row, all in the test split.
        """
        audio_data_path = os.path.join(output_path, "audio_files")
        tgt_audio_data_path = os.path.join(output_path, "tgt_audio_files")
        ensure_file_downloaded(source_url=VoxCeleb2Scenario.DOWNLOADING_URL, target_path=audio_data_path, unpack=True)
        annotations = pd.read_csv(VoxCeleb2Scenario.REFERENCE_URL, sep=",")

        instances: List[Instance] = []
        for _, row in tqdm(annotations.iterrows(), total=len(annotations)):
            tgt_first_audio_path = self._reformat_and_convert_audio_file(
                row["first"], tgt_audio_data_path, audio_data_path
            )
            tgt_second_audio_path = self._reformat_and_convert_audio_file(
                row["second"], tgt_audio_data_path, audio_data_path
            )

            answer = self._convert_answer_to_label(row["same"])
            # The given correct answer is a letter, but we need an index
            correct_answer_index: int = ord(answer) - ord("A")
            references: List[Reference] = [
                Reference(Output(text=option), tags=[CORRECT_TAG] if i == correct_answer_index else [])
                for i, option in enumerate(self.options)
            ]

            # Both clips come first, followed by the question about them.
            instance_input = Input(
                multimedia_content=MultimediaObject(
                    [
                        MediaObject(content_type="audio/wav", location=tgt_first_audio_path),
                        MediaObject(content_type="audio/wav", location=tgt_second_audio_path),
                        MediaObject(content_type="text/plain", text=self.IDENTITY_INSTRUCTION),
                    ]
                )
            )
            instances.append(Instance(input=instance_input, references=references, split=TEST_SPLIT))

        return instances
21 changes: 21 additions & 0 deletions src/helm/benchmark/static/schema_speech.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,27 @@ run_groups:
when: "2019"
language: English

- name: voxceleb2
display_name: VoxCeleb2
description: >
VoxCeleb is an audio-visual dataset consisting of short clips of human speech, extracted from
interview videos uploaded to YouTube. It contains over a million utterances from over 6,000
speakers with their gender, race, identity information in 145 different nationalities, covering
a wide range of accents, ages, ethnicities and languages.
([Chung et al, 2018](https://www.robots.ox.ac.uk/~vgg/publications/2018/Chung18a/chung18a.pdf))
metric_groups:
- accuracy
- general_information
environment:
main_name: exact_match
main_split: test
taxonomy:
task: audio identification
what: audio clips in the wild
who: real speakers
when: "2018"
language: English, German, French

- name: common_voice_15
display_name: Common Voice 15
description: >
Expand Down
20 changes: 20 additions & 0 deletions src/helm/common/audio_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import numpy as np
import soundfile as sf
import subprocess

from helm.common.multimodal_request_utils import get_contents_as_bytes
from helm.common.optional_dependencies import handle_module_not_found_error
Expand Down Expand Up @@ -42,3 +43,22 @@ def get_array_from_audio_file(path: str, sample_rate: Optional[int]) -> np.ndarr
# librosa accepts a local file path or a file-like object
audio_array, _ = librosa.load(audio_file, sr=sample_rate)
return audio_array


def use_ffmpeg_to_convert_audio_file(input_path: str, output_path: str) -> None:
    """Use ffmpeg to convert an audio file type.

    Runs ``ffmpeg -i input_path output_path``. Nothing is done when
    ``output_path`` already exists, so an earlier conversion is reused.

    Args:
        input_path: Path of the audio file to convert.
        output_path: Path of the converted file to create.

    Raises:
        ValueError: If the ffmpeg executable cannot be found, or if ffmpeg
            exits with a non-zero status.
    """
    if os.path.exists(output_path):
        return
    try:
        subprocess.run(["ffmpeg", "-i", input_path, output_path], check=True)
    except FileNotFoundError as e:
        # The executable itself is missing.
        raise ValueError(
            "Please install ffmpeg using `bash install-shelm-extras.sh` first to convert audio files."
        ) from e
    except subprocess.CalledProcessError as e:
        # output_path did not exist before this call, so anything there now is a
        # partial result of the failed run. Remove it, otherwise the existence
        # check above would treat it as a finished conversion next time.
        if os.path.exists(output_path):
            os.remove(output_path)
        raise ValueError(
            f"ffmpeg failed to convert audio file {input_path} to {output_path} "
            f"(exit code {e.returncode}). Make sure the input file is valid and ffmpeg is installed "
            "correctly (`bash install-shelm-extras.sh`)."
        ) from e


def use_ffmpeg_to_extract_audio_from_video(input_video_path: str, output_audio_path: str) -> None:
    """Use ffmpeg to extract the audio of a video file into an audio file.

    Runs ``ffmpeg -i input_video_path -q:a 0 -map a output_audio_path``.
    Nothing is done when ``output_audio_path`` already exists, so an earlier
    extraction is reused.

    Args:
        input_video_path: Path of the video file to read.
        output_audio_path: Path of the audio file to create.

    Raises:
        ValueError: If the ffmpeg executable cannot be found, or if ffmpeg
            exits with a non-zero status.
    """
    if os.path.exists(output_audio_path):
        return
    try:
        subprocess.run(["ffmpeg", "-i", input_video_path, "-q:a", "0", "-map", "a", output_audio_path], check=True)
    except FileNotFoundError as e:
        # The executable itself is missing.
        raise ValueError(
            "Please install ffmpeg using `bash install-shelm-extras.sh` first to extract audio files."
        ) from e
    except subprocess.CalledProcessError as e:
        # output_audio_path did not exist before this call, so anything there now
        # is a partial result of the failed run. Remove it, otherwise the existence
        # check above would treat it as a finished extraction next time.
        if os.path.exists(output_audio_path):
            os.remove(output_audio_path)
        raise ValueError(
            f"ffmpeg failed to extract audio from {input_video_path} to {output_audio_path} "
            f"(exit code {e.returncode}). Make sure the input file is valid and ffmpeg is installed "
            "correctly (`bash install-shelm-extras.sh`)."
        ) from e

0 comments on commit 47dedc8

Please sign in to comment.