
Adafactor learning-rate 0.005 seems best for t5-base training

TL;DR: In a comparison between Adafactor, AdamW and Distributed Shampoo for training t5-base, Adafactor without gradient accumulation appears to converge fastest. Because the two scripts I am using log the optimizer differently, I cannot change the legend to show the optimizer without displaying false information. All runs that converge below a loss of 4 after roughly 2 hours are Adafactor runs without gradient accumulation. Peach-sweep-6 is the best of these, with the learning rate set to 5e-3.
Created on February 19 | Last edited on February 14

[Charts: training and eval loss over time (minutes) for the run set of 43 runs]


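For reference, the best run's settings translate into roughly the following optimizer setup. This is a minimal sketch assuming optax (the optimizer library used by the Flax training scripts); the exact Adafactor arguments and learning-rate schedule in run_t5_mlm_flax_pjit.py may differ, and the decay after warmup is simplified to a constant here.

import optax

# Best-performing sweep settings: Adafactor, lr 5e-3, 10k warmup steps,
# no gradient accumulation.
learning_rate = 5e-3
warmup_steps = 10_000

# Linear warmup from 0 to the peak learning rate, then constant.
schedule = optax.join_schedules(
    schedules=[
        optax.linear_schedule(init_value=0.0, end_value=learning_rate,
                              transition_steps=warmup_steps),
        optax.constant_schedule(learning_rate),
    ],
    boundaries=[warmup_steps],
)

optimizer = optax.adafactor(learning_rate=schedule)
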
sweep-pjit-t5-base-adafactor-ga1-wu10k.yaml

program: "../train/run_t5_mlm_flax_pjit.py"
entity: yepster
project: tpu-t5-base
method: random
#early_terminate:
#  type: hyperband
#  min_iter: 3
metric:
  name: eval/loss
  goal: minimize
parameters:
  learning_rate:
    distribution: log_uniform
    # learning rate sampled between exp(min) and exp(max), i.e. ~2e-4 to ~5e-3
    min: -8.5
    max: -5.3
  gradient_accumulation_steps:
    value: 1
  warmup_steps:
    # multiplied by gradient accumulation
    value: 10000
#  weight_decay:
#    distribution: log_uniform
#    min: -9
#    max: -3
command:
  - python3
  - ${program}
  - "--output_dir"
  - "./output_sweep"
  - "--model_type"
  - "t5"
  - "--config_name"
  - "yhavinga/t5-v1.1-base-dutch-uncased"
  - "--tokenizer_name"
  - "yhavinga/t5-v1.1-base-dutch-uncased"
  - "--preprocessing_num_workers"
  - 96
  - "--optim"
  - "adafactor"
  - "--do_train"
  - "--do_eval"
  - "--dataset_name"
  - "yhavinga/mc4_nl_cleaned"
  - "--dataset_config_name"
  - "tiny"
  - "--max_seq_length"
  - 512
  - "--per_device_train_batch_size"
  - 8
  - "--per_device_eval_batch_size"
  - 16
  - "--gradient_accumulation_steps"
  - 1
  - "--learning_rate"
  - "5e-3"
  - "--overwrite_output_dir"
  - "--num_train_epochs"
  - 1
  - "--logging_steps"
  - 200
  - "--save_steps"
  - 80000
  - "--eval_steps"
  - 2400
  - "--warmup_steps"
  - 10000
  - "--max_train_samples"
  - 2400000
  - ${args}

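The log_uniform bounds in this sweep are specified in natural-log space: wandb samples a value uniformly between min and max and exponentiates it. A quick check of the range that actually covers (plain Python, not part of the training code):

import math

# log_uniform bounds are exponentiated, so for this sweep:
print(math.exp(-8.5))   # ~2.0e-4, lower end of the Adafactor learning-rate range
print(math.exp(-5.3))   # ~5.0e-3, upper end, where the best run (lr 5e-3) sits
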


sweep-pjit-t5-base-shampoo-ga1-wu10k.yaml

program: "../train/run_t5_mlm_flax_pjit.py"
entity: yepster
project: tpu-t5-base
method: random
#early_terminate:
#  type: hyperband
#  min_iter: 3
metric:
  name: eval/loss
  goal: minimize
parameters:
  learning_rate:
    distribution: log_uniform
    # learning rate sampled between exp(min) and exp(max), i.e. ~1e-4 to ~1e-3
    min: -9.2
    max: -6.9
  gradient_accumulation_steps:
    value: 1
  warmup_steps:
    # multiplied by gradient accumulation
    value: 10000
#  weight_decay:
#    distribution: log_uniform
#    min: -9
#    max: -3
command:
  - python3
  - ${program}
  - "--output_dir"
  - "./output_sweep"
  - "--model_type"
  - "t5"
  - "--config_name"
  - "yhavinga/t5-v1.1-base-dutch-uncased"
  - "--tokenizer_name"
  - "yhavinga/t5-v1.1-base-dutch-uncased"
  - "--preprocessing_num_workers"
  - 96
  - "--optim"
  - "distributed_shampoo"
  - "--do_train"
  - "--do_eval"
  - "--dataset_name"
  - "yhavinga/mc4_nl_cleaned"
  - "--dataset_config_name"
  - "tiny"
  - "--max_seq_length"
  - 512
  - "--per_device_train_batch_size"
  - 8
  - "--per_device_eval_batch_size"
  - 16
  - "--gradient_accumulation_steps"
  - 1
  - "--learning_rate"
  - "5e-3"
  - "--overwrite_output_dir"
  - "--num_train_epochs"
  - 1
  - "--logging_steps"
  - 200
  - "--save_steps"
  - 80000
  - "--eval_steps"
  - 2400
  - "--warmup_steps"
  - 10000
  - "--max_train_samples"
  - 2400000
  - ${args}



sweep-pmap-t5-base-adafactor-ga1-wu10k.yaml

program: "../train/run_t5_mlm_flax_pmap.py"
entity: yepster
project: tpu-t5-base
method: random
#early_terminate:
#  type: hyperband
#  min_iter: 3
metric:
  name: eval/loss
  goal: minimize
parameters:
  learning_rate:
    distribution: log_uniform
    # learning rate sampled between exp(min) and exp(max), i.e. ~3.4e-4 to ~5e-3
    min: -8
    max: -5.3
  gradient_accumulation_steps:
    value: 1
  warmup_steps:
    # multiplied by gradient accumulation
    value: 10000
#  weight_decay:
#    distribution: log_uniform
#    min: -9
#    max: -3
command:
  - python3
  - ${program}
  - "--output_dir"
  - "./output_sweep"
  - "--model_type"
  - "t5"
  - "--config_name"
  - "yhavinga/t5-v1.1-base-dutch-uncased"
  - "--tokenizer_name"
  - "yhavinga/t5-v1.1-base-dutch-uncased"
  - "--preprocessing_num_workers"
  - 96
  - "--adafactor"
  - "--do_train"
  - "--do_eval"
  - "--dataset_name"
  - "yhavinga/mc4_nl_cleaned"
  - "--dataset_config_name"
  - "tiny"
  - "--max_seq_length"
  - 512
  - "--per_device_train_batch_size"
  - 8
  - "--per_device_eval_batch_size"
  - 16
  - "--gradient_accumulation_steps"
  - 1
  - "--learning_rate"
  - "5e-3"
  - "--overwrite_output_dir"
  - "--num_train_epochs"
  - 1
  - "--logging_steps"
  - 200
  - "--save_steps"
  - 80000
  - "--eval_steps"
  - 2400
  - "--warmup_steps"
  - 10000
  - "--max_train_samples"
  - 2400000
  - ${args}



sweep-pmap-t5-base-adafactor-ga8-wu10k.yaml

program: "../train/run_t5_mlm_flax_pmap.py"
entity: yepster
project: tpu-t5-base
method: random
#early_terminate:
#  type: hyperband
#  min_iter: 3
metric:
  name: eval/loss
  goal: minimize
parameters:
  learning_rate:
    min: 0.001
    max: 0.02
  gradient_accumulation_steps:
    value: 8
  warmup_steps:
    # multiplied by gradient accumulation
    value: 1250
#  weight_decay:
#    distribution: log_uniform
#    min: -9
#    max: -3
command:
  - python3
  - ${program}
  - "--output_dir"
  - "./output_sweep"
  - "--model_type"
  - "t5"
  - "--config_name"
  - "yhavinga/t5-v1.1-base-dutch-uncased"
  - "--tokenizer_name"
  - "yhavinga/t5-v1.1-base-dutch-uncased"
  - "--preprocessing_num_workers"
  - 96
  - "--adafactor"
  - "--do_train"
  - "--do_eval"
  - "--dataset_name"
  - "yhavinga/mc4_nl_cleaned"
  - "--dataset_config_name"
  - "tiny"
  - "--max_seq_length"
  - 512
  - "--per_device_train_batch_size"
  - 8
  - "--per_device_eval_batch_size"
  - 16
  - "--gradient_accumulation_steps"
  - 1
  - "--learning_rate"
  - "5e-3"
  - "--overwrite_output_dir"
  - "--num_train_epochs"
  - 1
  - "--logging_steps"
  - 25
  - "--save_steps"
  - 80000
  - "--eval_steps"
  - 300
  - "--warmup_steps"
  - 1250
  - "--max_train_samples"
  - 2400000
  - ${args}

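As the warmup_steps comment in this config indicates, the warmup is scaled with gradient accumulation: 1250 optimizer steps at gradient_accumulation_steps=8 cover the same number of micro-batches as the 10000 warmup steps used in the ga1 sweeps. A quick sanity check:

# Warmup equivalence between the ga8 and ga1 sweeps:
# warmup steps * gradient accumulation factor = micro-batch steps
assert 1250 * 8 == 10_000
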


sweep-pmap-t5-base-adam.yaml

program: "../train/run_t5_mlm_flax_pmap.py"
entity: yepster
project: tpu-t5-base
method: random
#early_terminate:
#  type: hyperband
#  min_iter: 3
metric:
  name: eval/loss
  goal: minimize
parameters:
  learning_rate:
    distribution: log_uniform
    # learning rate sampled between exp(min) and exp(max), i.e. ~3.4e-4 to ~1.1e-2
    min: -8
    max: -4.5
  gradient_accumulation_steps:
    value: 8
  warmup_steps:
    # multiplied by gradient accumulation
    value: 1000
command:
  - python3
  - ${program}
  - "--output_dir"
  - "./output_sweep"
  - "--model_type"
  - "t5"
  - "--config_name"
  - "yhavinga/t5-v1.1-base-dutch-uncased"
  - "--tokenizer_name"
  - "yhavinga/t5-v1.1-base-dutch-uncased"
  - "--preprocessing_num_workers"
  - 96
  - "--do_train"
  - "--do_eval"
  - "--dataset_name"
  - "yhavinga/mc4_nl_cleaned"
  - "--dataset_config_name"
  - "tiny"
  - "--max_seq_length"
  - 512
  - "--per_device_train_batch_size"
  - 8
  - "--per_device_eval_batch_size"
  - 8
  - "--gradient_accumulation_steps"
  - 8
  - "--learning_rate"
  - "5e-3"
  - "--overwrite_output_dir"
  - "--num_train_epochs"
  - 1
  - "--logging_steps"
  - 25
  - "--save_steps"
  - 80000
  - "--eval_steps"
  - 200
  - "--weight_decay"
  - "0.001"
  - "--warmup_steps"
  - 1000
  - "--max_train_samples"
  - 2400000
  - ${args}

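For comparison with the Adafactor sketch near the top, here is roughly what the AdamW side of the sweep looks like, using the fixed values from this config (weight decay 0.001, 1000 warmup steps). Again a minimal optax sketch rather than the exact setup in run_t5_mlm_flax_pmap.py; the peak learning rate is whatever the sweep samples (roughly 3.4e-4 to 1.1e-2).

import optax

def make_adamw(peak_lr: float, warmup_steps: int = 1000,
               weight_decay: float = 0.001) -> optax.GradientTransformation:
    # Linear warmup to the sampled peak learning rate, then constant.
    schedule = optax.join_schedules(
        schedules=[
            optax.linear_schedule(init_value=0.0, end_value=peak_lr,
                                  transition_steps=warmup_steps),
            optax.constant_schedule(peak_lr),
        ],
        boundaries=[warmup_steps],
    )
    return optax.adamw(learning_rate=schedule, weight_decay=weight_decay)
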

Comment from N B: Is it possible to share the AdaFactor() arguments and the Trainer arguments? (1 reply)