[W&B workspace: Saforem2; 1,333 runs total, 6 visualized]
Visualized run groups (MODEL_SIZE): GPT1T_2L, GPT1T_4L, GPT1T_16L, GPT1T_32L, GPT1T_64L
[Panel: Throughput/SamplesPerSec (large graph); throughput section, 11 panels]
[Panels: system/gpu.0.memoryAllocated through system/gpu.3.memoryAllocated (GPU 0-3 Memory Allocated, %)]

All runs below share env.MACHINE: Perlmutter, micro_batch_size: 2, zero_stage: 1, env.MPSIZE: 8, seq_length: 2048, use_flash_attn: false, and env.GAS equal to deepspeed_configuration.gradient_accumulation_steps.

| MODEL_SIZE | world_size | env.PPSIZE | env.GAS | global_batch_size |
|------------|-----------:|-----------:|--------:|------------------:|
| GPT1T_2L   |          8 |          1 |       8 |                16 |
| GPT1T_4L   |         16 |          2 |      16 |                32 |
| GPT1T_16L  |         64 |          8 |      64 |               128 |
| GPT1T_32L  |        128 |         16 |     128 |               256 |
| GPT1T_64L  |        256 |         32 |     256 |               512 |
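The legend entries follow the usual Megatron-DeepSpeed batch-size bookkeeping: global_batch_size = micro_batch_size * gradient_accumulation_steps * data_parallel_size, where the data-parallel degree is world_size / (MPSIZE * PPSIZE). A minimal sketch that checks this relation against the table above (the RunConfig helper and its field names are illustrative, not the logged config keys):

```python
from dataclasses import dataclass


@dataclass
class RunConfig:
    # Fields mirror the logged W&B config values above (names here are illustrative).
    model_size: str
    world_size: int
    mp_size: int            # tensor/model-parallel degree (env.MPSIZE)
    pp_size: int            # pipeline-parallel degree (env.PPSIZE)
    micro_batch_size: int
    gas: int                # gradient accumulation steps (env.GAS)
    global_batch_size: int


def expected_global_batch(cfg: RunConfig) -> int:
    # Data-parallel degree is whatever remains after tensor and pipeline parallelism.
    dp_size = cfg.world_size // (cfg.mp_size * cfg.pp_size)
    return cfg.micro_batch_size * cfg.gas * dp_size


runs = [
    RunConfig("GPT1T_2L",    8, 8,  1, 2,   8,  16),
    RunConfig("GPT1T_4L",   16, 8,  2, 2,  16,  32),
    RunConfig("GPT1T_16L",  64, 8,  8, 2,  64, 128),
    RunConfig("GPT1T_32L", 128, 8, 16, 2, 128, 256),
    RunConfig("GPT1T_64L", 256, 8, 32, 2, 256, 512),
]

for cfg in runs:
    assert expected_global_batch(cfg) == cfg.global_batch_size, cfg.model_size
print("global_batch_size is consistent for all five runs")
```

Note that every configuration here keeps the data-parallel degree at 1; scaling from 2 to 64 layers is absorbed entirely by the pipeline-parallel degree and the matching gradient-accumulation depth.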
[Panel sections: Panel Section (2 panels), Run Comparisons (4), Train (8), Charts (1), timers (2), System (33)]
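The same run metadata can be pulled outside the browser with the W&B public API. The entity (saforem2) comes from the workspace header above, but the project name and the summary-metric key are assumptions (the key is taken from the panel title), so substitute the actual values:

```python
import wandb

# Entity from the workspace header; PROJECT_NAME is a placeholder assumption.
api = wandb.Api()
runs = api.runs("saforem2/PROJECT_NAME")

for run in runs:
    cfg = run.config
    if str(cfg.get("MODEL_SIZE", "")).startswith("GPT1T"):
        print(
            f"{run.name}: MODEL_SIZE={cfg.get('MODEL_SIZE')}, "
            f"world_size={cfg.get('world_size')}, "
            f"global_batch_size={cfg.get('global_batch_size')}, "
            f"SamplesPerSec={run.summary.get('Throughput/SamplesPerSec')}"
        )
```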