nlp/bert/pytorch/configs_squad.yml

# Copyright (c) 2021 Graphcore Ltd. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#----------------------------------------------------------------------------------
defaults: &defaults
  random_seed: 1984
  vocab_size: 30522
  dataloader_workers: 64
  async_dataloader: True
  ipus_per_replica: 4
  custom_ops: True
  synthetic_data: False
  optimizer: AdamW
  auto_loss_scaling: False
  weight_decay: 0.0
  embedding_serialization_factor: 1
  recompute_checkpoint_every_layer: True
  replicated_tensor_sharding: False
  optimizer_state_offchip: True
  enable_half_first_order_momentum: False
  enable_half_partials: True
  layer_norm_eps: 0.001
  attention_probs_dropout_prob: 0.0
  mask_tokens: 20
  dataset: squad
  sequence_length: 128
  num_epochs: 2
  packed_data: False
#----------------------------------------------------------------------------------

#----------------------------------------------------------------------------------
demo_tiny_128: &demo_tiny_128
  <<: *defaults

  # Execution
  micro_batch_size: 1
  training_steps: 150
  device_iterations: 1
  replication_factor: 1
  gradient_accumulation: 16
  dataloader_workers: 1

  # Model
  hidden_size: 64
  num_hidden_layers: 3
  num_attention_heads: 8
  sequence_length: 128
  mask_tokens: 20
  layers_per_ipu: [0, 1, 1, 1]
  matmul_proportion: 0.6

  # Optimizer
  lr_warmup: 0.0
  lr_schedule: constant
  learning_rate: 0.001
  loss_scaling: 16.0
  weight_decay: 0.0

  # Dataset
  input_files: ["data/*.tfrecord"]
#----------------------------------------------------------------------------------

#----------------------------------------------------------------------------------
squad_base_384: &squad_base_384
  <<: *defaults

  # Execution
  micro_batch_size: 1
  num_epochs: 3
  device_iterations: 2
  replication_factor: 4
  gradient_accumulation: 8
  executable_cache_dir: "./exe_cache"
  replicated_tensor_sharding: True
  optimizer_state_offchip: False
  enable_half_first_order_momentum: True
  dataset: squad

  # Model
  vocab_size: 30522
  hidden_size: 768
  num_hidden_layers: 12
  num_attention_heads: 12
  sequence_length: 384
  layers_per_ipu: [0, 4, 4, 4]
  matmul_proportion: 0.25
  attention_probs_dropout_prob: 0.1

  # Optimizer
  learning_rate: 6e-5
  lr_schedule: linear
  loss_scaling: 64.0
  weight_decay: 0.01
  lr_warmup: 0.25
  optimizer: "AdamW"
#----------------------------------------------------------------------------------

#----------------------------------------------------------------------------------
# A 16IPU optimized squad_large configuration that relies on replicatedTensorSharding
# to reduce the optimizer state such that it fits on chip.
# NB This requires replication_factor=2 to fit.
squad_large_384:
  <<: *squad_base_384

  # Execution
  micro_batch_size: 4
  device_iterations: 2
  gradient_accumulation: 32
  replication_factor: 2
  num_epochs: 3

  # Model
  hidden_size: 1024
  num_hidden_layers: 24
  num_attention_heads: 16
  attention_probs_dropout_prob: 0.15
  ipus_per_replica: 8
  layers_per_ipu: [2,3,3,3,3,3,3,4]
  matmul_proportion: [0.08,0.25,0.32,0.32,0.36,0.38,0.4,0.32]
  optimizer_state_offchip: False
  enable_half_first_order_momentum: True

  # Optimizer
  loss_scaling: 64.0
  learning_rate: 5e-5
  lr_warmup: 0.10
#----------------------------------------------------------------------------------

#----------------------------------------------------------------------------------
# SQuAD BERT Large with 4 IPU pipeline and 4 replicas
squad_large_384_4x4:
  <<: *squad_base_384

  # Execution
  micro_batch_size: 2
  device_iterations: 1
  gradient_accumulation: 64
  replication_factor: 4
  num_epochs: 3

  # Model
  hidden_size: 1024
  num_hidden_layers: 24
  num_attention_heads: 16
  attention_probs_dropout_prob: 0.15
  ipus_per_replica: 4
  layers_per_ipu: [3, 7, 7, 7]
  matmul_proportion: [0.15, 0.25, 0.25, 0.25]
  optimizer_state_offchip: True
  enable_half_first_order_momentum: True

  # Optimizer
  loss_scaling: 64.0
  learning_rate: 5e-5
  lr_warmup: 0.10
#----------------------------------------------------------------------------------