diff --git a/.github/workflows/mobile-asr-models.yaml b/.github/workflows/mobile-asr-models.yaml
index c58bb0396..aa2f03eff 100644
--- a/.github/workflows/mobile-asr-models.yaml
+++ b/.github/workflows/mobile-asr-models.yaml
@@ -7,7 +7,6 @@ on:
 
   workflow_dispatch:
 
-
 concurrency:
   group: mobile-asr-models-${{ github.ref }}
   cancel-in-progress: true
@@ -16,11 +15,14 @@ jobs:
   mobile-asr-models:
     if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj' || github.repository_owner == 'csu-fangjun'
     runs-on: ${{ matrix.os }}
+    name: ${{ matrix.index }}/${{ matrix.total }}
     strategy:
       fail-fast: false
       matrix:
         os: [ubuntu-latest]
         python-version: ["3.8"]
+        total: ["11"]
+        index: ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]
 
     steps:
       - uses: actions/checkout@v4
@@ -33,7 +35,20 @@ jobs:
       - name: Install dependencies
         shell: bash
         run: |
-          python3 -m pip install onnxruntime==1.16.3 onnx==1.15.0
+          python3 -m pip install onnxruntime==1.16.3 onnx==1.15.0 jinja2
+
+      - name: Generate build script
+        shell: bash
+        run: |
+          cd scripts/mobile-asr-models
+
+          total=${{ matrix.total }}
+          index=${{ matrix.index }}
+
+          ./generate-asr.py --total $total --index $index
+          chmod +x run2.sh
+          mv run2.sh run.sh
+          ls -lh
 
       - name: Run
         shell: bash
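The `total`/`index` matrix above is a simple sharding trick: it launches 11 identical jobs (named `0/11` through `10/11`), and each job asks `generate-asr.py` for only its own slice of the model list. The partitioning itself is not shown in this diff, so the following is a minimal sketch of one plausible scheme; the function name and the strided split are assumptions:

```python
# Hypothetical sharding helper: runner `index` of `total` takes every
# total-th model, so the slices are disjoint and together cover the list.
def select_models(models, total, index):
    assert 0 <= index < total
    return models[index::total]

# e.g. with 25 models and total=11, runner 0 would build models 0, 11, 22
```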
diff --git a/.github/workflows/mobile-kws-models.yaml b/.github/workflows/mobile-kws-models.yaml
new file mode 100644
index 000000000..b7ccda052
--- /dev/null
+++ b/.github/workflows/mobile-kws-models.yaml
@@ -0,0 +1,67 @@
+name: mobile-kws-models
+
+on:
+  push:
+    branches:
+      - asr-mobile
+
+  workflow_dispatch:
+
+concurrency:
+  group: mobile-kws-models-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  mobile-kws-models:
+    if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj' || github.repository_owner == 'csu-fangjun'
+    runs-on: ${{ matrix.os }}
+    name: ${{ matrix.index }}/${{ matrix.total }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest]
+        python-version: ["3.8"]
+        total: ["2"]
+        index: ["0", "1"]
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        shell: bash
+        run: |
+          python3 -m pip install onnxruntime==1.16.3 onnx==1.15.0 jinja2
+
+      - name: Generate build script
+        shell: bash
+        run: |
+          cd scripts/mobile-asr-models
+
+          total=${{ matrix.total }}
+          index=${{ matrix.index }}
+
+          ./generate-kws.py --total $total --index $index
+          chmod +x run2.sh
+          mv run2.sh run.sh
+          ls -lh
+
+      - name: Run
+        shell: bash
+        run: |
+          cd scripts/mobile-asr-models
+          ./run.sh
+
+      - name: Release
+        uses: svenstaro/upload-release-action@v2
+        with:
+          file_glob: true
+          file: ./kws/*.tar.bz2
+          overwrite: true
+          repo_name: k2-fsa/sherpa-onnx
+          repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
+          tag: kws-models
diff --git a/scripts/apk/generate-asr-2pass-apk-script.py b/scripts/apk/generate-asr-2pass-apk-script.py
index 9a85f35b4..a90e47205 100755
--- a/scripts/apk/generate-asr-2pass-apk-script.py
+++ b/scripts/apk/generate-asr-2pass-apk-script.py
@@ -2,7 +2,6 @@
 
 import argparse
 from dataclasses import dataclass
-from typing import List, Optional
 
 import jinja2
 
diff --git a/scripts/apk/generate-asr-apk-script.py b/scripts/apk/generate-asr-apk-script.py
index 05a22a921..608532176 100755
--- a/scripts/apk/generate-asr-apk-script.py
+++ b/scripts/apk/generate-asr-apk-script.py
@@ -2,7 +2,6 @@
 
 import argparse
 from dataclasses import dataclass
-from typing import List, Optional
 
 import jinja2
 
diff --git a/scripts/apk/generate-audio-tagging-apk-script.py b/scripts/apk/generate-audio-tagging-apk-script.py
index 1442d0695..033eda5a4 100755
--- a/scripts/apk/generate-audio-tagging-apk-script.py
+++ b/scripts/apk/generate-audio-tagging-apk-script.py
@@ -2,7 +2,6 @@
 
 import argparse
 from dataclasses import dataclass
-from typing import List, Optional
 
 import jinja2
 
diff --git a/scripts/apk/generate-slid-apk-script.py b/scripts/apk/generate-slid-apk-script.py
index a1f9ffae3..9350a1df6 100755
--- a/scripts/apk/generate-slid-apk-script.py
+++ b/scripts/apk/generate-slid-apk-script.py
@@ -2,7 +2,6 @@
 
 import argparse
 from dataclasses import dataclass
-from typing import List, Optional
 
 import jinja2
 
diff --git a/scripts/apk/generate-speaker-identification-apk-script.py b/scripts/apk/generate-speaker-identification-apk-script.py
index d200a8ebb..e242d9819 100755
--- a/scripts/apk/generate-speaker-identification-apk-script.py
+++ b/scripts/apk/generate-speaker-identification-apk-script.py
@@ -2,7 +2,7 @@
 
 import argparse
 from dataclasses import dataclass
-from typing import List, Optional
+from typing import List
 
 import jinja2
 
@@ -34,76 +34,99 @@ class SpeakerIdentificationModel:
 def get_3dspeaker_models() -> List[SpeakerIdentificationModel]:
     models = [
-        SpeakerIdentificationModel(model_name="3dspeaker_speech_campplus_sv_en_voxceleb_16k.onnx"),
-        SpeakerIdentificationModel(model_name="3dspeaker_speech_campplus_sv_zh-cn_16k-common.onnx"),
-        SpeakerIdentificationModel(model_name="3dspeaker_speech_eres2net_base_200k_sv_zh-cn_16k-common.onnx"),
-        SpeakerIdentificationModel(model_name="3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx"),
-        SpeakerIdentificationModel(model_name="3dspeaker_speech_eres2net_large_sv_zh-cn_3dspeaker_16k.onnx"),
-        SpeakerIdentificationModel(model_name="3dspeaker_speech_eres2net_sv_en_voxceleb_16k.onnx"),
-        SpeakerIdentificationModel(model_name="3dspeaker_speech_eres2net_sv_zh-cn_16k-common.onnx"),
+        SpeakerIdentificationModel(
+            model_name="3dspeaker_speech_campplus_sv_en_voxceleb_16k.onnx"
+        ),
+        SpeakerIdentificationModel(
+            model_name="3dspeaker_speech_campplus_sv_zh-cn_16k-common.onnx"
+        ),
+        SpeakerIdentificationModel(
+            model_name="3dspeaker_speech_eres2net_base_200k_sv_zh-cn_16k-common.onnx"
+        ),
+        SpeakerIdentificationModel(
+            model_name="3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx"
+        ),
+        SpeakerIdentificationModel(
+            model_name="3dspeaker_speech_eres2net_large_sv_zh-cn_3dspeaker_16k.onnx"
+        ),
+        SpeakerIdentificationModel(
+            model_name="3dspeaker_speech_eres2net_sv_en_voxceleb_16k.onnx"
+        ),
+        SpeakerIdentificationModel(
+            model_name="3dspeaker_speech_eres2net_sv_zh-cn_16k-common.onnx"
+        ),
     ]
-    prefix = '3dspeaker_speech_'
+    prefix = "3dspeaker_speech_"
     num = len(prefix)
     for m in models:
-        m.framework = '3dspeaker'
+        m.framework = "3dspeaker"
         m.short_name = m.model_name[num:-5]
-        if '_zh-cn_' in m.model_name:
-            m.lang = 'zh'
-        elif '_en_' in m.model_name:
-            m.lang = 'en'
+        if "_zh-cn_" in m.model_name:
+            m.lang = "zh"
+        elif "_en_" in m.model_name:
+            m.lang = "en"
         else:
             raise ValueError(m)
 
     return models
 
+
 def get_wespeaker_models() -> List[SpeakerIdentificationModel]:
     models = [
         SpeakerIdentificationModel(model_name="wespeaker_en_voxceleb_CAM++.onnx"),
         SpeakerIdentificationModel(model_name="wespeaker_en_voxceleb_CAM++_LM.onnx"),
-        SpeakerIdentificationModel(model_name="wespeaker_en_voxceleb_resnet152_LM.onnx"),
-        SpeakerIdentificationModel(model_name="wespeaker_en_voxceleb_resnet221_LM.onnx"),
-        SpeakerIdentificationModel(model_name="wespeaker_en_voxceleb_resnet293_LM.onnx"),
+        SpeakerIdentificationModel(
+            model_name="wespeaker_en_voxceleb_resnet152_LM.onnx"
+        ),
+        SpeakerIdentificationModel(
+            model_name="wespeaker_en_voxceleb_resnet221_LM.onnx"
+        ),
+        SpeakerIdentificationModel(
+            model_name="wespeaker_en_voxceleb_resnet293_LM.onnx"
+        ),
         SpeakerIdentificationModel(model_name="wespeaker_en_voxceleb_resnet34.onnx"),
         SpeakerIdentificationModel(model_name="wespeaker_en_voxceleb_resnet34_LM.onnx"),
         SpeakerIdentificationModel(model_name="wespeaker_zh_cnceleb_resnet34.onnx"),
         SpeakerIdentificationModel(model_name="wespeaker_zh_cnceleb_resnet34_LM.onnx"),
     ]
-    prefix = 'wespeaker_xx_'
+    prefix = "wespeaker_xx_"
     num = len(prefix)
     for m in models:
-        m.framework = 'wespeaker'
+        m.framework = "wespeaker"
         m.short_name = m.model_name[num:-5]
-        if '_zh_' in m.model_name:
-            m.lang = 'zh'
-        elif '_en_' in m.model_name:
-            m.lang = 'en'
+        if "_zh_" in m.model_name:
+            m.lang = "zh"
+        elif "_en_" in m.model_name:
+            m.lang = "en"
         else:
             raise ValueError(m)
 
     return models
 
+
 def get_nemo_models() -> List[SpeakerIdentificationModel]:
     models = [
-        SpeakerIdentificationModel(model_name="nemo_en_speakerverification_speakernet.onnx"),
+        SpeakerIdentificationModel(
+            model_name="nemo_en_speakerverification_speakernet.onnx"
+        ),
         SpeakerIdentificationModel(model_name="nemo_en_titanet_large.onnx"),
         SpeakerIdentificationModel(model_name="nemo_en_titanet_small.onnx"),
     ]
-    prefix = 'nemo_en_'
+    prefix = "nemo_en_"
     num = len(prefix)
     for m in models:
-        m.framework = 'nemo'
+        m.framework = "nemo"
         m.short_name = m.model_name[num:-5]
-        if '_zh_' in m.model_name:
-            m.lang = 'zh'
-        elif '_en_' in m.model_name:
-            m.lang = 'en'
+        if "_zh_" in m.model_name:
+            m.lang = "zh"
+        elif "_en_" in m.model_name:
+            m.lang = "en"
         else:
             raise ValueError(m)
 
     return models
 
-
 def main():
     args = get_args()
     index = args.index
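All three helpers derive `short_name` the same way: slicing off `num` characters drops the framework prefix, and `-5` drops the `.onnx` suffix. A quick worked example of that slice:

```python
# Worked example of m.model_name[num:-5] from get_wespeaker_models() above
model_name = "wespeaker_zh_cnceleb_resnet34.onnx"
prefix = "wespeaker_xx_"  # "xx" stands in for the two-letter language code
num = len(prefix)  # 13
print(model_name[num:-5])  # prints: cnceleb_resnet34
```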
diff --git a/scripts/apk/generate-vad-asr-apk-script.py b/scripts/apk/generate-vad-asr-apk-script.py
index 22dd5f751..6b7c19d1d 100755
--- a/scripts/apk/generate-vad-asr-apk-script.py
+++ b/scripts/apk/generate-vad-asr-apk-script.py
@@ -2,7 +2,6 @@
 
 import argparse
 from dataclasses import dataclass
-from typing import List, Optional
 
 import jinja2
 
diff --git a/scripts/mobile-asr-models/.gitignore b/scripts/mobile-asr-models/.gitignore
new file mode 100644
index 000000000..fc2c46312
--- /dev/null
+++ b/scripts/mobile-asr-models/.gitignore
@@ -0,0 +1 @@
+run2.sh
diff --git a/scripts/mobile-asr-models/README.md b/scripts/mobile-asr-models/README.md
index ff8715502..83d25f6a7 100644
--- a/scripts/mobile-asr-models/README.md
+++ b/scripts/mobile-asr-models/README.md
@@ -16,3 +16,97 @@ https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipform
 
 The following [colab notebook](https://colab.research.google.com/drive/1RsVZbsxbPjazeGrNNbZNjXCYbEG2F2DU?usp=sharing)
 provides examples to use the above two models.
+
+**WARNING**: Tested with `onnxruntime==1.16.3 onnx==1.15.0`.
+
+```bash
+pip install onnxruntime==1.16.3 onnx==1.15.0
+```
+
+## More examples
+
+### [sherpa-onnx-streaming-zipformer-korean-2024-06-16](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html#sherpa-onnx-streaming-zipformer-korean-2024-06-16-korean)
+
+| | encoder-epoch-99-avg-1.onnx | encoder-epoch-99-avg-1.int8.onnx |
+|---|---|---|
+| Dynamic batch size | 279 MB | 122 MB |
+| Batch size fixed to 1 | 264 MB | 107 MB |
+
+### [sherpa-onnx-streaming-zipformer-en-20M-2023-02-17](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html#csukuangfj-sherpa-onnx-streaming-zipformer-en-20m-2023-02-17-english)
+
+| | encoder-epoch-99-avg-1.onnx | encoder-epoch-99-avg-1.int8.onnx |
+|---|---|---|
+| Dynamic batch size | 85 MB | 41 MB |
+| Batch size fixed to 1 | 75 MB | 32 MB |
+
+### [sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html#sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12-chinese)
+
+| | encoder-epoch-20-avg-1-chunk-16-left-128.onnx | encoder-epoch-20-avg-1-chunk-16-left-128.int8.onnx |
+|---|---|---|
+| Dynamic batch size | 249 MB | 67 MB |
+| Batch size fixed to 1 | 247 MB | 65 MB |
+
+### [icefall-asr-zipformer-streaming-wenetspeech-20230615](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html#pkufool-icefall-asr-zipformer-streaming-wenetspeech-20230615-chinese)
+
+| | encoder-epoch-12-avg-4-chunk-16-left-128.onnx | encoder-epoch-12-avg-4-chunk-16-left-128.int8.onnx |
+|---|---|---|
+| Dynamic batch size | 250 MB | 68 MB |
+| Batch size fixed to 1 | 247 MB | 65 MB |
+
+### [sherpa-onnx-streaming-zipformer-en-2023-06-26](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html#csukuangfj-sherpa-onnx-streaming-zipformer-en-2023-06-26-english)
+
+| | encoder-epoch-99-avg-1-chunk-16-left-128.onnx | encoder-epoch-99-avg-1-chunk-16-left-128.int8.onnx |
+|---|---|---|
+| Dynamic batch size | 250 MB | 68 MB |
+| Batch size fixed to 1 | 247 MB | 65 MB |
+
+### [sherpa-onnx-streaming-zipformer-en-2023-06-21](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html#csukuangfj-sherpa-onnx-streaming-zipformer-en-2023-06-21-english)
+
+| | encoder-epoch-99-avg-1.onnx | encoder-epoch-99-avg-1.int8.onnx |
+|---|---|---|
+| Dynamic batch size | 338 MB | 180 MB |
+| Batch size fixed to 1 | 264 MB | 107 MB |
+
+### [sherpa-onnx-streaming-zipformer-en-2023-02-21](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html#csukuangfj-sherpa-onnx-streaming-zipformer-en-2023-02-21-english)
+
+| | encoder-epoch-99-avg-1.onnx | encoder-epoch-99-avg-1.int8.onnx |
+|---|---|---|
+| Dynamic batch size | 279 MB | 122 MB |
+| Batch size fixed to 1 | 264 MB | 107 MB |
+
+### [sherpa-onnx-streaming-zipformer-fr-2023-04-14](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html#shaojieli-sherpa-onnx-streaming-zipformer-fr-2023-04-14-french)
+
+| | encoder-epoch-29-avg-9-with-averaged-model.onnx | encoder-epoch-29-avg-9-with-averaged-model.int8.onnx |
+|---|---|---|
+| Dynamic batch size | 279 MB | 121 MB |
+| Batch size fixed to 1 | 264 MB | 107 MB |
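In each table, the second row is smaller because the encoder is exported with its batch dimension pinned to 1 before int8 quantization. `run-impl.sh` itself is not shown in this diff, so the snippet below is only a rough sketch of one way to pin a symbolic batch dimension with the `onnx` API; the filenames, and the assumption that this resembles what `run-impl.sh` does, are not taken from the patch:

```python
import onnx

# Pin every symbolic batch dimension (e.g. "N") on the graph I/O to 1.
model = onnx.load("encoder-epoch-99-avg-1.onnx")
for tensor in list(model.graph.input) + list(model.graph.output):
    dims = tensor.type.tensor_type.shape.dim
    if dims and dims[0].HasField("dim_param"):  # symbolic => dynamic batch
        dims[0].dim_value = 1
onnx.save(model, "encoder-batch-1.onnx")
```

Pinning the I/O shapes alone does not shrink the weights; the savings in the tables presumably come from the graph optimizations that can run once the batch size is known.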
+### [sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html#sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16-bilingual-chinese-english)
+
+| | encoder-epoch-99-avg-1.onnx | encoder-epoch-99-avg-1.int8.onnx |
+|---|---|---|
+| Dynamic batch size | 85 MB | 41 MB |
+| Batch size fixed to 1 | 75 MB | 32 MB |
+
+### [sherpa-onnx-streaming-zipformer-zh-14M-2023-02-23](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html#csukuangfj-sherpa-onnx-streaming-zipformer-zh-14m-2023-02-23-chinese)
+
+| | encoder-epoch-99-avg-1.onnx | encoder-epoch-99-avg-1.int8.onnx |
+|---|---|---|
+| Dynamic batch size | 40 MB | 21 MB |
+| Batch size fixed to 1 | 33 MB | 15 MB |
+
+### [sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01](https://k2-fsa.github.io/sherpa/onnx/kws/pretrained_models/index.html#sherpa-onnx-kws-zipformer-wenetspeech-3-3m-2024-01-01-chinese)
+
+| | encoder-epoch-12-avg-2-chunk-16-left-64.onnx | encoder-epoch-12-avg-2-chunk-16-left-64.int8.onnx |
+|---|---|---|
+| Dynamic batch size | 12 MB | 4.6 MB |
+| Batch size fixed to 1 | 11 MB | 3.9 MB |
+
+### [sherpa-onnx-kws-zipformer-gigaspeech-3.3M-2024-01-01](https://k2-fsa.github.io/sherpa/onnx/kws/pretrained_models/index.html#sherpa-onnx-kws-zipformer-gigaspeech-3-3m-2024-01-01-english)
+
+| | encoder-epoch-12-avg-2-chunk-16-left-64.onnx | encoder-epoch-12-avg-2-chunk-16-left-64.int8.onnx |
+|---|---|---|
+| Dynamic batch size | 12 MB | 4.6 MB |
+| Batch size fixed to 1 | 11 MB | 3.9 MB |
diff --git a/scripts/mobile-asr-models/dynamic_quantization.py b/scripts/mobile-asr-models/dynamic_quantization.py
index 80828a823..ec79b84d6 100755
--- a/scripts/mobile-asr-models/dynamic_quantization.py
+++ b/scripts/mobile-asr-models/dynamic_quantization.py
@@ -1,9 +1,23 @@
 #!/usr/bin/env python3
 
 import argparse
 
+import onnxruntime
 from onnxruntime.quantization import QuantType, quantize_dynamic
 
 
+def show(filename):
+    session_opts = onnxruntime.SessionOptions()
+    session_opts.log_severity_level = 3
+    sess = onnxruntime.InferenceSession(filename, session_opts)
+    for i in sess.get_inputs():
+        print(i)
+
+    print("-----")
+
+    for i in sess.get_outputs():
+        print(i)
+
+
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument(
@@ -25,6 +39,9 @@ def get_args():
 def main():
     args = get_args()
     print(vars(args))
+    print(f"----------{args.input}----------")
+    show(args.input)
+    print("------------------------------")
 
     quantize_dynamic(
         model_input=args.input,
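The second hunk above cuts off before the remaining `quantize_dynamic` arguments. For orientation, a direct call looks roughly like this; the output path and `weight_type` are assumptions, not read from the script:

```python
from onnxruntime.quantization import QuantType, quantize_dynamic

# Dynamic quantization stores the weights as 8-bit integers and quantizes
# activations on the fly at inference time, so no calibration data is needed.
quantize_dynamic(
    model_input="encoder-epoch-99-avg-1.onnx",
    model_output="encoder-epoch-99-avg-1.int8.onnx",
    weight_type=QuantType.QInt8,  # assumption: the script may use QUInt8
)
```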
diff --git a/scripts/mobile-asr-models/generate-asr.py b/scripts/mobile-asr-models/generate-asr.py
new file mode 100755
index 000000000..de3ef1960
--- /dev/null
+++ b/scripts/mobile-asr-models/generate-asr.py
@@ -0,0 +1,358 @@
+#!/usr/bin/env python3
+
+import argparse
+from dataclasses import dataclass
+
+import jinja2
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--total",
+        type=int,
+        default=1,
+        help="Number of runners",
+    )
+    parser.add_argument(
+        "--index",
+        type=int,
+        default=0,
+        help="Index of the current runner",
+    )
+    return parser.parse_args()
+
+
+@dataclass
+class Model:
+    # We will download
+    # https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/{model_name}.tar.bz2
+    model_name: str
+
+    cmd: str
+
+
+def get_streaming_zipformer_transducer_models():
+    models = [
+        Model(
+            model_name="sherpa-onnx-streaming-zipformer-korean-2024-06-16",
+            cmd="""
+            ./run-impl.sh \
+              --input $src/encoder-epoch-99-avg-1.onnx \
+              --output1 $dst/encoder-epoch-99-avg-1.onnx \
+              --output2 $dst/encoder-epoch-99-avg-1.int8.onnx
+
+            cp -v $src/bpe.model $dst/ || true
+            cp -v $src/tokens.txt $dst/
+            cp -av $src/test_wavs $dst/
+            cp -v $src/decoder-epoch-99-avg-1.onnx $dst/
+            cp -v $src/joiner-epoch-99-avg-1.int8.onnx $dst/
+
+            cat > $dst/notes.md <<EOF
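The diff is truncated here; the remaining `Model` entries and the tail of `generate-asr.py` are not shown. Judging from the sibling generators in `scripts/apk/`, the tail presumably renders this runner's slice of the models into `run2.sh` through a jinja2 template. A hedged sketch of that step, where the template filename and variable name are assumptions:

```python
import jinja2

# Hypothetical rendering step, mirroring the scripts/apk/ generators:
# fill a shell-script template with the selected models' name/cmd pairs.
def write_run_script(models):
    with open("run.sh.in") as f:  # assumed template name
        template = jinja2.Template(f.read())
    with open("run2.sh", "w") as f:
        f.write(template.render(model_list=models))
```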