---
# --- Pretraining ---
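# `pretrain_options` is a YAML anchor shared by the pretraining benchmarks below.
# Each entry under `data` pairs a metric name with a regular expression that is
# matched against the training log to extract that metric; for example, the
# `throughput` pattern would capture "1234.5" from a log line such as
# "throughput: 1234.5 samples/sec". `reduction_type: "final"` presumably keeps the
# last reported value rather than an average, and `output` selects which extracted
# metrics (with their units) are reported for the benchmark.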
pretrain_options: &pretrain_options
  data:
    throughput:
      regexp: 'throughput: *(.*?) samples/sec'
    mlm_acc:
      regexp: 'mlm_acc: *(.*?) \%'
      reduction_type: "final"
    nsp_acc:
      regexp: 'nsp_acc: *(.*?) \%'
      reduction_type: "final"
    nsp_loss:
      regexp: 'nsp_loss: *(\d*\.\d*)'
      reduction_type: "final"
    mlm_loss:
      regexp: 'mlm_loss: *(\d*\.\d*)'
      reduction_type: "final"
    loss:
      regexp: 'total loss: *(\d*\.\d*)'
      reduction_type: "final"
  output:
    - [samples/sec, "throughput"]
    - [loss, "loss"]
config_options: &config_options
  requirements_path: requirements.txt
  required_apt_packages_path: required_apt_packages.txt
# POD4
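# Each benchmark entry below merges the shared anchors via the `<<:` merge key, so
# it inherits the metric regexps from `pretrain_options` and the dependency paths
# from `config_options`. The `parameters` block appears to define a sweep: every
# value listed for `phase` (sequence lengths 128 and 512) is substituted into the
# `{phase}` placeholders in `cmd`, so each entry runs once per phase. For instance,
# `--config pretrain_base_{phase}_pod4` would expand to
# `--config pretrain_base_128_pod4` and `--config pretrain_base_512_pod4`.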
pytorch_bert_base_pretrain_real_pod4:
  <<: [*pretrain_options, *config_options]
  description: |
    BERT-Base pretraining benchmark on real data. Phase 1 and phase 2.
  parameters:
    phase: 128,512
  cmd: >-
    python3 run_pretraining.py
    --config pretrain_base_{phase}_pod4
    --training-steps 10
    --input-file $DATASETS_DIR/wikipedia/{phase}/wiki_1[0-1]*.tfrecord
    --disable-progress-bar
pytorch_bert_large_pretrain_real_pod4:
  <<: [*pretrain_options, *config_options]
  description: |
    BERT-Large pretraining benchmark on real data. Phase 1 and phase 2.
  parameters:
    phase: 128,512
  cmd: >-
    python3 run_pretraining.py
    --config pretrain_large_{phase}_pod4
    --training-steps 10
    --input-file $DATASETS_DIR/wikipedia/{phase}/wiki_1[0-1]*.tfrecord
    --disable-progress-bar
pytorch_bert_large_packed_pretrain_real_pod4:
  <<: [*pretrain_options, *config_options]
  description: |
    BERT Large pretraining phase 1 and 2 with real data on 4 IPUs
    for performance testing.
  parameters:
    phase: 128,512
  cmd: >-
    python3 run_pretraining.py
    --config pretrain_large_{phase}_pod4
    --training-steps 10
    --input-files $DATASETS_DIR/wikipedia/torch_bert/packed_{phase}/wiki_000.tfrecord
    --disable-progress-bar
    --packed-data
# POD16
pytorch_bert_base_pretrain_real_pod16:
  <<: [*pretrain_options, *config_options]
  description: |
    BERT-Base pretraining benchmark on real data. Phase 1 and phase 2.
  parameters:
    phase: 128,512
  cmd: >-
    python3 run_pretraining.py
    --config pretrain_base_{phase}
    --training-steps 10
    --input-file $DATASETS_DIR/wikipedia/{phase}/wiki_1[0-1]*.tfrecord
    --disable-progress-bar
pytorch_bert_large_pretrain_real_pod16:
  <<: [*pretrain_options, *config_options]
  description: |
    BERT-Large pretraining benchmark on real data. Phase 1 and phase 2.
  parameters:
    phase: 128,512
  cmd: >-
    python3 run_pretraining.py
    --config pretrain_large_{phase}
    --training-steps 10
    --input-file $DATASETS_DIR/wikipedia/{phase}/wiki_1[0-1]*.tfrecord
    --disable-progress-bar
pytorch_bert_large_packed_pretrain_real_pod16:
  <<: [*pretrain_options, *config_options]
  description: |
    BERT Large pretraining phase 1 and 2 with real data on 16 IPUs
    for performance testing.
  parameters:
    phase: 128,512
  cmd: >-
    python3 run_pretraining.py
    --config pretrain_large_{phase}
    --training-steps 10
    --input-files $DATASETS_DIR/wikipedia/torch_bert/packed_{phase}/wiki_000.tfrecord
    --disable-progress-bar
    --packed-data
# POD64
pytorch_bert_large_packed_pretrain_real_pod64:
  <<: [*pretrain_options, *config_options]
  description: |
    BERT Large pretraining phase 1 and 2 with real data on 64 IPUs
    for performance testing.
  parameters:
    phase: 128,512
  cmd: >-
    python3 run_pretraining.py
    --config pretrain_large_{phase}_POD64
    --training-steps 10
    --input-files $DATASETS_DIR/wikipedia/psc_{phase}/wiki_000.tfrecord
    --disable-progress-bar
    --packed-data
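# The convergence benchmarks below wrap the training command in `poprun`,
# Graphcore's launcher for multi-instance/multi-replica IPU jobs: 16 replicas at
# 4 IPUs per replica accounts for the 64 IPUs of a POD64, the `--vipu-*` flags
# manage the V-IPU partition, and the `--mpi-global-args`/`--mpi-local-args`
# settings forward network options and environment variables (e.g. $DATASETS_DIR
# and the Poplar options) to every launched instance.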
pytorch_bert_large_sl128_pretrain_real_pod64_conv:
  <<: [*pretrain_options, *config_options]
  description: |
    BERT Large pretraining phase 1 with real data on 64 IPUs
    for convergence testing.
  cmd: >-
    poprun
    --vv
    --num-instances 1
    --num-replicas 16
    --update-partition=yes
    --remove-partition=yes
    --reset-partition=no
    --sync-type=ST_POD_NATIVE_DEFAULT
    --vipu-server-timeout 400
    --vipu-server-host $IPUOF_VIPU_API_HOST
    --vipu-partition=$IPUOF_VIPU_API_PARTITION_ID
    --vipu-allocation=$VIPU_ALLOCATION_ID
    --ipus-per-replica 4
    --mpi-global-args="
    --mca oob_tcp_if_include $TCP_IF_INCLUDE
    --mca btl_tcp_if_include $TCP_IF_INCLUDE"
    --mpi-local-args="
    -x OPAL_PREFIX
    -x LD_LIBRARY_PATH
    -x PATH
    -x PYTHONPATH
    -x IPUOF_VIPU_API_TIMEOUT=400
    -x POPLAR_LOG_LEVEL=WARN
    -x DATASETS_DIR
    -x POPLAR_ENGINE_OPTIONS
    -x POPLAR_TARGET_OPTIONS"
    python3 run_pretraining.py
    --config pretrain_large_128_POD64
    --input-file $DATASETS_DIR/wikipedia/128/*.tfrecord
    --disable-progress-bar
    --checkpoint-output-dir checkpoint/phase1
    --wandb
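# The phase 2 run below resumes from the phase 1 checkpoints: phase 1 writes to
# checkpoint/phase1, phase 2 reads that directory via --checkpoint-input-dir and
# writes checkpoint/phase2, which the SQuAD fine-tuning benchmark further down
# consumes in turn.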
pytorch_bert_large_sl512_pretrain_real_pod64_conv:
  <<: [*pretrain_options, *config_options]
  description: |
    BERT Large pretraining phase 2 with real data on 64 IPUs
    for convergence testing.
  cmd: >-
    poprun
    --vv
    --num-instances 1
    --num-replicas 16
    --update-partition=yes
    --remove-partition=yes
    --reset-partition=no
    --sync-type=ST_POD_NATIVE_DEFAULT
    --vipu-server-timeout 400
    --vipu-server-host $IPUOF_VIPU_API_HOST
    --vipu-partition=$IPUOF_VIPU_API_PARTITION_ID
    --vipu-allocation=$VIPU_ALLOCATION_ID
    --ipus-per-replica 4
    --mpi-global-args="
    --mca oob_tcp_if_include $TCP_IF_INCLUDE
    --mca btl_tcp_if_include $TCP_IF_INCLUDE"
    --mpi-local-args="
    -x OPAL_PREFIX
    -x LD_LIBRARY_PATH
    -x PATH
    -x PYTHONPATH
    -x IPUOF_VIPU_API_TIMEOUT=400
    -x POPLAR_LOG_LEVEL=WARN
    -x DATASETS_DIR
    -x POPLAR_ENGINE_OPTIONS
    -x POPLAR_TARGET_OPTIONS"
    python3 run_pretraining.py
    --config pretrain_large_512_POD64
    --input-file $DATASETS_DIR/wikipedia/512/*.tfrecord
    --disable-progress-bar
    --checkpoint-output-dir checkpoint/phase2
    --checkpoint-input-dir checkpoint/phase1
    --wandb
pytorch_bert_large_packed_sl128_pretrain_real_pod64_conv:
  <<: [*pretrain_options, *config_options]
  description: |
    BERT Large pretraining phase 1 (SL 128) with packed real data on 64 IPUs
    for convergence testing.
  cmd: >-
    python3 run_pretraining.py
    --config pretrain_large_128_POD64
    --input-files $DATASETS_DIR/wikipedia/psc_128/*.tfrecord
    --disable-progress-bar
    --checkpoint-output-dir checkpoint/phase1
    --wandb
    --packed-data
pytorch_bert_large_packed_sl512_pretrain_real_pod64_conv:
  <<: [*pretrain_options, *config_options]
  description: |
    BERT Large pretraining phase 2 (SL 512) with packed real data on 64 IPUs
    for convergence testing.
  cmd: >-
    python3 run_pretraining.py
    --config pretrain_large_512_POD64
    --input-file $DATASETS_DIR/wikipedia/psc_512/*.tfrecord
    --disable-progress-bar
    --checkpoint-output-dir checkpoint/phase2
    --checkpoint-input-dir checkpoint/phase1
    --wandb
    --packed-data
# --- SQuAD ---
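# `squad_options` mirrors `pretrain_options` but only extracts throughput and the
# final loss from the SQuAD run logs.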
squad_options: &squad_options
  data:
    throughput:
      regexp: 'throughput: *(.*?) samples\/sec'
    loss:
      regexp: 'loss: *(\d*\.\d*)'
      reduction_type: "final"
  output:
    - [samples/sec, "throughput"]
    - [loss, "loss"]
pytorch_bert_squad_large_pretrain_real_pod16:
  <<: [*squad_options, *config_options]
  description: |
    BERT Large SQuAD benchmark on real data.
  parameters:
    phase: 384
  cmd: >-
    python3 run_squad.py
    --squad-do-validation False
    --config squad_large_{phase}
    --num-epochs 1
pytorch_bert_squad_large_finetune_real_pod16_conv:
  <<: [*squad_options, *config_options]
  description: |
    BERT Large SQuAD finetuning on real data. Validation included.
  cmd: >-
    python3 run_squad.py
    --squad-do-validation True
    --config squad_large_384
    --checkpoint-input-dir checkpoint/phase2
    --wandb
pytorch_bert_squad_large_infer_gen_pod16:
  <<: *config_options
  description: |
    BERT Large SQuAD inference.
  data:
    throughput:
      regexp: 'throughput: *(.*?) samples\/sec'
  output:
    - [samples/sec, 'throughput']
  cmd: >-
    python3 run_squad.py
    --config squad_large_384
    --squad-do-training False
    --dataset generated
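# The Triton benchmark below drives inference through a test-runner entry point:
# `request_type` (SYNC, ASYNC) and `parallel_processes` (1, 4, 8) appear to be
# expanded into the bracketed test id passed to `-k` (a pytest-style selector),
# giving one run per combination, e.g.
# test_single_model[bert-squad_large_384-RequestType.SYNC-4].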
pytorch_bert_squad_large_tritonserver_infer_gen_pod16:
  <<: *config_options
  description: |
    BERT Large SQuAD inference hosted by a Triton server.
  data:
    throughput:
      regexp: 'throughput: *(.*?) samples\/sec'
    latency:
      regexp: 'latency: *(.*?) ms \(mean\)'
  output:
    - [samples/sec, 'throughput']
    - [latency(ms), 'latency']
  parameters:
    request_type: SYNC,ASYNC
    parallel_processes: 1,4,8
  cmd: >-
    python3 run_benchmark_with_triton_server.py
    -s
    -k test_single_model[bert-squad_large_384-RequestType.{request_type}-{parallel_processes}]
    --benchmark_only=true
    ./tests_serial/tritonserver/