# config.yaml

# Top-level run settings.
exp_name: 'multi'
# NOTE(review): the value is the string 'cpu' although the key is named `gpu`;
# presumably a device selector — confirm against the code that reads it.
gpu: 'cpu'
run_debug_eval: false
logger:
  offline: false
  # Explicit null instead of a bare empty value (both load as null).
  # NOTE(review): presumably the key is meant to be supplied outside version
  # control — confirm, and do not commit a real key here.
  wandb_key: null


# `tts` section: path to a weights file and the step to restore from.
tts:
  weights_path: './pretrained/290000.pth.tar'
  restore_step: 0  # NOTE(review): file name above says 290000 — confirm 0 is intended
  
# `hifi` section: weights path plus the numeric settings stored alongside it.
hifi:
  weights_path: './pretrained/hifi.pth'
  MAX_WAV_VALUE: 32768
  # Deliberately a quoted string, not an integer.
  resblock: '1'
  num_gpus: 0
  batch_size: 8
  learning_rate: 0.0002
  adam_b1: 0.8
  adam_b2: 0.99
  lr_decay: 0.999
  seed: 1234

  # 8 * 8 * 2 * 2 = 256, the same number as hop_size below.
  # NOTE(review): presumably the two must stay in sync — confirm.
  upsample_rates: [8, 8, 2, 2]
  upsample_kernel_sizes: [16, 16, 4, 4]
  upsample_initial_channel: 512
  resblock_kernel_sizes: [3, 7, 11]
  # Three dilation lists, matching the three entries of resblock_kernel_sizes.
  # NOTE(review): presumably paired one-to-one — confirm.
  resblock_dilation_sizes:
    - [1, 3, 5]
    - [1, 3, 5]
    - [1, 3, 5]
  resblock_initial_channel: 256

  segment_size: 8192
  num_mels: 80
  # NOTE(review): 1025 equals 2048 / 2 + 1, whereas n_fft below is 1024
  # (1024 / 2 + 1 = 513) — confirm whether this key is actually read.
  num_freq: 1025
  n_fft: 1024
  hop_size: 256
  win_size: 1024
  sampling_rate: 22050


# `train_config` section: output paths, optimizer values and step counts.
train_config:
  path:
    ckpt_path: "../output/ckpt/multi_final"
    result_path: "../output/result/multi_final"
  optimizer:
    batch_size: 16
    betas: [0.95, 0.999]
    eps: 0.00001
    weight_decay: 0.0
    grad_clip_thresh: 1.0
    # NOTE(review): if gradients are accumulated over 4 batches of 16, the
    # effective batch would be 64 — confirm how the trainer uses this key.
    grad_acc_step: 4
    warm_up_step: 4000
    # All three values lie below step.total_step (900000).
    anneal_steps: [300000, 400000, 500000]
    anneal_rate: 0.7
  step:
    total_step: 900000
    # NOTE(review): the four keys below are presumably intervals, in steps,
    # for logging, synthesis, validation and saving — confirm.
    log_step: 100
    synth_step: 1000
    val_step: 1000
    save_step: 5000
  
  # NOTE(review): the value is fractional (0.15) although the name reads like
  # a count — presumably a ratio; confirm against the code that reads it.
  max_masks_per_sentence: 0.15

# `preprocess_config` section: dataset name, file paths and feature settings.
preprocess_config:
  dataset: "MAIN"

  path:
    lexicon_path: "./rus_all.dict"
    raw_path: "./speakers"
    preprocessed_path: "./processed"

  preprocessing:
    # NOTE(review): presumably the number of items held out for validation —
    # confirm against the preprocessing code.
    val_size: 512
    text:
      text_cleaners: []
      language: "ru"
    audio:
      # Same numbers as hifi.sampling_rate and hifi.MAX_WAV_VALUE in this file
      # (the latter is written there as the integer 32768).
      sampling_rate: 22050
      max_wav_value: 32768.0
    stft:
      # Same numbers as hifi.n_fft, hifi.hop_size and hifi.win_size.
      filter_length: 1024
      hop_length: 256
      win_length: 1024
    mel:
      n_mel_channels: 80  # same number as hifi.num_mels
      mel_fmin: 0
      # Set to 8000 for the HiFi-GAN vocoder; set to null for the MelGAN vocoder.
      mel_fmax: 8000
    pitch:
      feature: "phoneme_level"  # supports 'phoneme_level' or 'frame_level'
      # Lowercase boolean, matching the other booleans in this file.
      normalization: true
    energy:
      feature: "phoneme_level"  # supports 'phoneme_level' or 'frame_level'
      normalization: true

# `model_config` section: transformer sizes, variance settings and vocoder choice.
model_config:
  transformer:
    encoder_layer: 4
    encoder_head: 2
    # encoder_hidden, variance_hidden and decoder_hidden are all 256, the same
    # number as variance_predictor.filter_size below.
    # NOTE(review): presumably these must agree — confirm before changing one.
    encoder_hidden: 256
    variance_hidden: 256
    decoder_layer: 6
    decoder_head: 2
    decoder_hidden: 256
    conv_filter_size: 1024
    conv_kernel_size: [9, 1]
    encoder_dropout: 0.2
    decoder_dropout: 0.2

  variance_predictor:
    filter_size: 256
    kernel_size: 3
    dropout: 0.5

  # Sibling of variance_predictor (a direct child of model_config), not part of it.
  use_cwt: false

  variance_embedding:
    # Both quantization keys support 'linear' or 'log'. 'log' is allowed only
    # if the corresponding pitch/energy values are not normalized during
    # preprocessing; preprocess_config in this file enables normalization for
    # both, hence 'linear'.
    pitch_quantization: "linear"
    energy_quantization: "linear"
    n_bins: 256

  multi_speaker: true

  max_seq_len: 1000

  vocoder:
    model: "HiFi-GAN"  # supports 'HiFi-GAN', 'MelGAN'
    speaker: "universal"  # supports 'LJSpeech', 'universal'
    use_cpu: true