Add GigaAM NeMo transducer model for Russian ASR #1467

Merged · 7 commits · Oct 25, 2024
34 changes: 31 additions & 3 deletions .github/workflows/export-nemo-giga-am-to-onnx.yaml
@@ -38,7 +38,7 @@ jobs:
mkdir $d/test_wavs
rm scripts/nemo/GigaAM/model.onnx
mv -v scripts/nemo/GigaAM/*.int8.onnx $d/
mv -v scripts/nemo/GigaAM/*.md $d/
cp -v scripts/nemo/GigaAM/*.md $d/
mv -v scripts/nemo/GigaAM/*.pdf $d/
mv -v scripts/nemo/GigaAM/tokens.txt $d/
mv -v scripts/nemo/GigaAM/*.wav $d/test_wavs/
@@ -51,6 +51,34 @@ jobs:

tar cjvf ${d}.tar.bz2 $d

- name: Run Transducer
shell: bash
run: |
pushd scripts/nemo/GigaAM
./run-rnnt.sh
popd

d=sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24
mkdir $d
mkdir $d/test_wavs

mv -v scripts/nemo/GigaAM/encoder.int8.onnx $d/
mv -v scripts/nemo/GigaAM/decoder.onnx $d/
mv -v scripts/nemo/GigaAM/joiner.onnx $d/

cp -v scripts/nemo/GigaAM/*.md $d/
mv -v scripts/nemo/GigaAM/*.pdf $d/
mv -v scripts/nemo/GigaAM/tokens.txt $d/
mv -v scripts/nemo/GigaAM/*.wav $d/test_wavs/
mv -v scripts/nemo/GigaAM/run-rnnt.sh $d/
mv -v scripts/nemo/GigaAM/*-rnnt.py $d/

ls -lh scripts/nemo/GigaAM/

ls -lh $d

tar cjvf ${d}.tar.bz2 $d

- name: Release
uses: svenstaro/upload-release-action@v2
with:
@@ -61,7 +89,7 @@ jobs:
repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
tag: asr-models

- name: Publish to huggingface (CTC)
- name: Publish to huggingface (Transducer)
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
uses: nick-fields/retry@v3
@@ -73,7 +101,7 @@ jobs:
git config --global user.email "[email protected]"
git config --global user.name "Fangjun Kuang"

d=sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24
d=sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24/
export GIT_LFS_SKIP_SMUDGE=1
export GIT_CLONE_PROTECTION_ACTIVE=false
git clone https://csukuangfj:[email protected]/csukuangfj/$d huggingface
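For reference, the mv/cp commands in the new "Run Transducer" step imply roughly the following layout for the released tarball (globs are reproduced as written in the workflow; the wav files and the license PDF are the ones downloaded by run-rnnt.sh below):

sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24/
├── encoder.int8.onnx
├── decoder.onnx
├── joiner.onnx
├── tokens.txt
├── *.md
├── GigaAM License_NC.pdf
├── run-rnnt.sh
├── *-rnnt.py
└── test_wavs/
    ├── example.wav
    └── long_example.wav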
18 changes: 18 additions & 0 deletions scripts/apk/generate-vad-asr-apk-script.py
@@ -351,6 +351,24 @@ def get_models():

ls -lh

popd
""",
),
Model(
model_name="sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24",
idx=20,
lang="ru",
short_name="nemo_transducer_giga_am",
cmd="""
pushd $model_name

rm -rfv test_wavs

rm -fv *.sh
rm -fv *.py

ls -lh

popd
""",
),
1 change: 1 addition & 0 deletions scripts/nemo/GigaAM/export-onnx-ctc.py
@@ -75,6 +75,7 @@ def add_meta_data(filename: str, meta_data: Dict[str, str]):
onnx.save(model, filename)


@torch.no_grad()
def main():
model = EncDecCTCModel.from_config_file("./ctc_model_config.yaml")
ckpt = torch.load("./ctc_model_weights.ckpt", map_location="cpu")
119 changes: 119 additions & 0 deletions scripts/nemo/GigaAM/export-onnx-rnnt.py
@@ -0,0 +1,119 @@
#!/usr/bin/env python3
# Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang)

from typing import Dict

import onnx
import torch
import torchaudio
from nemo.collections.asr.models import EncDecRNNTBPEModel
from nemo.collections.asr.modules.audio_preprocessing import (
AudioToMelSpectrogramPreprocessor as NeMoAudioToMelSpectrogramPreprocessor,
)
from nemo.collections.asr.parts.preprocessing.features import (
FilterbankFeaturesTA as NeMoFilterbankFeaturesTA,
)
from onnxruntime.quantization import QuantType, quantize_dynamic


def add_meta_data(filename: str, meta_data: Dict[str, str]):
"""Add meta data to an ONNX model. It is changed in-place.

Args:
filename:
Filename of the ONNX model to be changed.
meta_data:
Key-value pairs.
"""
model = onnx.load(filename)
while len(model.metadata_props):
model.metadata_props.pop()

for key, value in meta_data.items():
meta = model.metadata_props.add()
meta.key = key
meta.value = str(value)

onnx.save(model, filename)


class FilterbankFeaturesTA(NeMoFilterbankFeaturesTA):
def __init__(self, mel_scale: str = "htk", wkwargs=None, **kwargs):
if "window_size" in kwargs:
del kwargs["window_size"]
if "window_stride" in kwargs:
del kwargs["window_stride"]

super().__init__(**kwargs)

self._mel_spec_extractor: torchaudio.transforms.MelSpectrogram = (
torchaudio.transforms.MelSpectrogram(
sample_rate=self._sample_rate,
win_length=self.win_length,
hop_length=self.hop_length,
n_mels=kwargs["nfilt"],
window_fn=self.torch_windows[kwargs["window"]],
mel_scale=mel_scale,
norm=kwargs["mel_norm"],
n_fft=kwargs["n_fft"],
f_max=kwargs.get("highfreq", None),
f_min=kwargs.get("lowfreq", 0),
wkwargs=wkwargs,
)
)


class AudioToMelSpectrogramPreprocessor(NeMoAudioToMelSpectrogramPreprocessor):
def __init__(self, mel_scale: str = "htk", **kwargs):
super().__init__(**kwargs)
kwargs["nfilt"] = kwargs["features"]
del kwargs["features"]
self.featurizer = (
FilterbankFeaturesTA( # Deprecated arguments; kept for config compatibility
mel_scale=mel_scale,
**kwargs,
)
)


@torch.no_grad()
def main():
model = EncDecRNNTBPEModel.from_config_file("./rnnt_model_config.yaml")
ckpt = torch.load("./rnnt_model_weights.ckpt", map_location="cpu")
model.load_state_dict(ckpt, strict=False)
model.eval()

with open("./tokens.txt", "w", encoding="utf-8") as f:
for i, s in enumerate(model.joint.vocabulary):
f.write(f"{s} {i}\n")
f.write(f"<blk> {i+1}\n")
print("Saved to tokens.txt")

model.encoder.export("encoder.onnx")
model.decoder.export("decoder.onnx")
model.joint.export("joiner.onnx")

meta_data = {
"vocab_size": model.decoder.vocab_size, # not including the blank
"pred_rnn_layers": model.decoder.pred_rnn_layers,
"pred_hidden": model.decoder.pred_hidden,
"normalize_type": "",
"subsampling_factor": 4,
"model_type": "EncDecRNNTBPEModel",
"version": "1",
"model_author": "https://github.com/salute-developers/GigaAM",
"license": "https://github.com/salute-developers/GigaAM/blob/main/GigaAM%20License_NC.pdf",
"language": "Russian",
"is_giga_am": 1,
}
add_meta_data("encoder.onnx", meta_data)

quantize_dynamic(
model_input="encoder.onnx",
model_output="encoder.int8.onnx",
weight_type=QuantType.QUInt8,
)


if __name__ == "__main__":
main()
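A minimal sketch (not part of this PR) for double-checking the metadata written by add_meta_data() above; it assumes you run it in scripts/nemo/GigaAM right after the export, while encoder.onnx still exists (run-rnnt.sh deletes that file at the end):

#!/usr/bin/env python3
import onnx

model = onnx.load("encoder.onnx")
# add_meta_data() stores every value as a string
print({p.key: p.value for p in model.metadata_props})
# Expected keys: vocab_size, pred_rnn_layers, pred_hidden, normalize_type,
# subsampling_factor, model_type, version, model_author, license, language,
# is_giga_am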
14 changes: 9 additions & 5 deletions scripts/nemo/GigaAM/run-ctc.sh
@@ -21,11 +21,15 @@ function install_nemo() {
}

function download_files() {
curl -SL -O https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/ctc_model_weights.ckpt
curl -SL -O https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/ctc_model_config.yaml
curl -SL -O https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/example.wav
curl -SL -O https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/long_example.wav
curl -SL -O https://huggingface.co/csukuangfj/tmp-files/resolve/main/GigaAM%20License_NC.pdf
# curl -SL -O https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/ctc_model_weights.ckpt
# curl -SL -O https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/ctc_model_config.yaml
# curl -SL -O https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/example.wav
# curl -SL -O https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/long_example.wav
curl -SL -O https://huggingface.co/csukuangfj/tmp-files/resolve/main/GigaAM/ctc/ctc_model_weights.ckpt
curl -SL -O https://huggingface.co/csukuangfj/tmp-files/resolve/main/GigaAM/ctc/ctc_model_config.yaml
curl -SL -O https://huggingface.co/csukuangfj/tmp-files/resolve/main/GigaAM/example.wav
curl -SL -O https://huggingface.co/csukuangfj/tmp-files/resolve/main/GigaAM/long_example.wav
curl -SL -O https://huggingface.co/csukuangfj/tmp-files/resolve/main/GigaAM/GigaAM%20License_NC.pdf
}

install_nemo
50 changes: 50 additions & 0 deletions scripts/nemo/GigaAM/run-rnnt.sh
@@ -0,0 +1,50 @@
#!/usr/bin/env bash
# Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang)

set -ex

function install_nemo() {
curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
python3 get-pip.py

pip install torch==2.4.0 torchaudio==2.4.0 -f https://download.pytorch.org/whl/torch_stable.html

pip install -qq wget text-unidecode matplotlib>=3.3.2 onnx onnxruntime pybind11 Cython einops kaldi-native-fbank soundfile librosa
pip install -qq ipython

# sudo apt-get install -q -y sox libsndfile1 ffmpeg python3-pip ipython

BRANCH='main'
python3 -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[asr]

pip install numpy==1.26.4
}

function download_files() {
# curl -SL -O https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/rnnt_model_weights.ckpt
# curl -SL -O https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/rnnt_model_config.yaml
# curl -SL -O https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/example.wav
# curl -SL -O https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/long_example.wav
# curl -SL -O https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/tokenizer_all_sets.tar

curl -SL -O https://huggingface.co/csukuangfj/tmp-files/resolve/main/GigaAM/rnnt/rnnt_model_weights.ckpt
curl -SL -O https://huggingface.co/csukuangfj/tmp-files/resolve/main/GigaAM/rnnt/rnnt_model_config.yaml
curl -SL -O https://huggingface.co/csukuangfj/tmp-files/resolve/main/GigaAM/example.wav
curl -SL -O https://huggingface.co/csukuangfj/tmp-files/resolve/main/GigaAM/long_example.wav
curl -SL -O https://huggingface.co/csukuangfj/tmp-files/resolve/main/GigaAM/GigaAM%20License_NC.pdf
curl -SL -O https://huggingface.co/csukuangfj/tmp-files/resolve/main/GigaAM/rnnt/tokenizer_all_sets.tar
tar -xf tokenizer_all_sets.tar && rm tokenizer_all_sets.tar
ls -lh
echo "---"
ls -lh tokenizer_all_sets
echo "---"
}

install_nemo
download_files

python3 ./export-onnx-rnnt.py
ls -lh
python3 ./test-onnx-rnnt.py
rm -v encoder.onnx
ls -lh
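Once the exported files are packaged, decoding with the sherpa-onnx Python API should look roughly like the sketch below. This is an assumption-laden illustration, not part of this PR: it presumes that OfflineRecognizer.from_transducer accepts these keyword arguments (including model_type="nemo_transducer") in the installed sherpa-onnx version, and that the paths refer to the files packed into the sherpa-onnx-nemo-transducer-giga-am-russian-2024-10-24 tarball above.

#!/usr/bin/env python3
import sherpa_onnx
import soundfile as sf

# Paths follow the tarball layout produced by the workflow above.
recognizer = sherpa_onnx.OfflineRecognizer.from_transducer(
    encoder="encoder.int8.onnx",
    decoder="decoder.onnx",
    joiner="joiner.onnx",
    tokens="tokens.txt",
    model_type="nemo_transducer",  # assumption: NeMo-style transducer decoding
)

samples, sample_rate = sf.read("test_wavs/example.wav", dtype="float32")
stream = recognizer.create_stream()
stream.accept_waveform(sample_rate, samples)
recognizer.decode_stream(stream)
print(stream.result.text)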