Kmfoda's workspace
Runs: 5 (5 visualized)
Columns: Name, State, Notes, User, Tags, Created, Runtime, Sweep, code, learning_rate, perplexity, step, tokens_per_sec, train_loss, grad_norm_after_outer, outer_step

Run 1
State: Crashed | Notes: - | User: kmfoda | Runtime: 8h 46m 34s | Sweep: -
code:
import os
import sys

with open(sys.argv[0]) as f:
    code = f.read()  # read the code of this file ASAP, for logging

import time
import math
import torch
import wandb
from contextlib import contextmanager, nullcontext
import torch.distributed as dist
from torch.distributed import init_process_group, destroy_process_group
from transformers import (
    AutoTokenizer,
    LlamaForCausalLM,
    get_cosine_schedule_with_warmup,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset
from torch.utils.data import DataLoader
from datasets.distributed import split_dataset_by_node
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.cuda.amp import autocast
from demo import DeMo


def get_grad_norm(parameters):
    """Compute the global L2 norm of all parameter gradients."""
    total_norm = 0.0
    for p in parameters:
        if p.grad is not None:
            param_norm = p.grad.detach().data.norm(2)
            total_norm += param_norm.item() ** 2
    return total_norm ** 0.5


def get_offloaded_param(outer_optimizer: torch.optim.Optimizer):
    """Snapshot the outer optimizer's parameters onto the CPU."""
    return [
        param.data.detach().clone().to("cpu")
        for group in outer_optimizer.param_groups
        for param in group["params"]
    ]


# Initialize distributed training
ddp = int(os.environ.get('RANK', -1)) != -1
if ddp:
    init_process_group(backend='nccl')
    ddp_rank = int(os.environ['RANK'])
    ddp_local_rank = int(os.environ['LOCAL_RANK'])
    ddp_world_size = int(os.environ['WORLD_SIZE'])
    device = f'cuda:{ddp_local_rank}'
    torch.cuda.set_device(device)
    master_process = ddp_rank == 0
else:
    ddp_rank = 0
    ddp_local_rank = 0
    ddp_world_size = 1
    master_process = True
    device = "cuda" if torch.cuda.is_available() else "cpu"

torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.set_float32_matmul_precision('high')
torch.backends.cudnn.deterministic = False

# Training hyperparameters
MODEL_NAME = "PrimeIntellect/llama-150m-fresh"
RUN_NAME = "Open_DeMoCo_Copy_150m_AdamW+NoSignum_500Steps_andClip"
B = 16  # micro-batch size per GPU
T = 1024  # sequence length
batch_size = 512  # sequences per optimizer step on each rank (B * gradient_accumulation_steps)
max_lr = 4e-4
min_lr = max_lr * 0.1  # defined but unused: the cosine schedule below decays to 0
warmup_steps = 1000
max_steps = 88_000
local_steps = 500  # inner optimizer steps between outer (DeMo) steps
assert batch_size % B == 0
gradient_accumulation_steps = batch_size // B

wandb.init(
    project="DeMoCo",
    entity="kmfoda",
    name=RUN_NAME,
    config={
        "code": code  # add this file's source to the run config
    }
)

# Load model and tokenizer
model = LlamaForCausalLM.from_pretrained(MODEL_NAME).to(device)
# model = torch.compile(model)
if ddp:
    model = DDP(model, device_ids=[ddp_local_rank])
tokenizer = AutoTokenizer.from_pretrained("togethercomputer/LLaMA-2-7B-32K", use_fast=True)
tokenizer.pad_token = "</s>"

# Load and prepare dataset
ds = load_dataset(
    "allenai/c4",
    "en",
    streaming=True,
    data_files={
        "train": "en/c4-train.*.json.gz",
        "validation": "en/c4-validation.00000-of-00008.json.gz",
    },
)


def tokenize_function(data):
    return tokenizer(data["text"], truncation=True, max_length=T)


tokenized_datasets = ds.map(
    tokenize_function,
    batched=True,
    remove_columns=["text", "timestamp", "url"],
)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
train_dataset = split_dataset_by_node(tokenized_datasets["train"], world_size=ddp_world_size, rank=ddp_rank)
train_dataloader = DataLoader(
    train_dataset,
    collate_fn=data_collator,
    batch_size=B,
    pin_memory=True,
    num_workers=4,
    prefetch_factor=2,
)
val_dataset = tokenized_datasets["validation"]
val_dataloader = DataLoader(val_dataset, collate_fn=data_collator, batch_size=B)  # built but never used below

# Initialize optimizers
inner_optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=max_lr,
    weight_decay=0.1,
    betas=(0.9, 0.95),
    fused=True,
)
# TODO: remove signum + check other SGD params
outer_optimizer = DeMo(
    model.parameters(),
    lr=0.7,
    compression_decay=0.999,  # default values, adjust as needed
    compression_topk=32,
    compression_chunk=64,
    process_group=None,  # will be set automatically by DDP
)
scheduler = get_cosine_schedule_with_warmup(
    inner_optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=max_steps,
)

if ddp:
    for param in model.parameters():
        # make sure every rank starts from the same weight initialization
        # (dist.broadcast requires an initialized process group)
        dist.broadcast(param.data, src=0)

params_offloaded = get_offloaded_param(outer_optimizer)
model.train()
loss_accum = 0.0  # lives outside the loop so it accumulates over all micro-steps of one optimizer step

for step, batch in enumerate(train_dataloader):
    t0 = time.time()
    real_step = (step + 1) // gradient_accumulation_steps
    step_within_grad_acc = (step + 1) % gradient_accumulation_steps
    batch = {k: v.to(device) for k, v in batch.items()}

    if ddp:
        # no_sync(): skip DDP's gradient all-reduce; each rank accumulates gradients locally
        with model.no_sync():
            with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
                outputs = model(**batch)
            if master_process:
                print(f"Outer Step: {real_step // local_steps} - Inner Step: {step} - Loss: {outputs.loss}")
            loss = outputs.loss / gradient_accumulation_steps
            loss_accum += loss.detach()
            loss.backward()
    else:
        with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
            outputs = model(**batch)
        loss = outputs.loss / gradient_accumulation_steps
        loss_accum += loss.detach()
        loss.backward()

    if step_within_grad_acc == 0:  # only update after accumulating all gradients
        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        inner_optimizer.step()
        scheduler.step()
        inner_optimizer.zero_grad(set_to_none=True)

        # Outer optimization step every local_steps inner steps
        if real_step % local_steps == 0 and real_step != 0:
            if master_process:
                print(f"perform outer step at step {real_step}")
            main_param = [param for group in inner_optimizer.param_groups for param in group["params"]]
            for param_offloaded, param in zip(params_offloaded, main_param):
                param_offloaded_on_device = param_offloaded.data.to(param.device)
                # pseudo-gradient: displacement of the weights since the last outer step
                param.grad = param_offloaded_on_device - param.data
                # dist.all_reduce(tensor=param.grad, op=dist.ReduceOp.AVG)
                param.data = param_offloaded_on_device
            # no GradScaler step here: autocast runs in bfloat16, so gradient scaling is not used
            outer_optimizer.step()
            outer_optimizer.zero_grad()
            params_offloaded = get_offloaded_param(outer_optimizer)

            if master_process:
                # Log gradient norms after the outer step
                grad_norm_after = get_grad_norm(model.parameters())
                print(f"Gradient norm after outer step: {grad_norm_after:.6f}")
                wandb.log({
                    "grad_norm_after_outer": grad_norm_after,
                    "outer_step": real_step // local_steps,
                })

        # Logging after a full gradient accumulation cycle
        t1 = time.time()
        dt = t1 - t0
        lr = [group['lr'] for group in inner_optimizer.param_groups][0]
        tokens_processed = B * T * gradient_accumulation_steps * ddp_world_size
        wandb.log({
            "step": real_step,
            "learning_rate": lr,
            "train_loss": loss_accum.item(),
            "tokens_per_sec": tokens_processed / dt,
            "perplexity": torch.exp(loss_accum).item(),
        })
        loss_accum = 0.0  # reset accumulator after logging

if ddp:
    destroy_process_group()
learning_rate: 0.00039991 | perplexity: 1900.71045 | step: 1823 | tokens_per_sec: 3725842.96399 | train_loss: 7.54998 | grad_norm_after_outer: - | outer_step: -
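All five scripts follow the same two-level pattern: an inner AdamW optimizer takes ordinary gradient steps, and every local_steps optimizer steps the outer DeMo optimizer is applied to a pseudo-gradient, i.e. the displacement of the weights since the last outer step. The sketch below isolates that pattern only and is not the runs' code: torch.optim.SGD stands in for DeMo (whose implementation lives in the external demo module and is not part of the logged script), and the linear model, random data, and learning rates are illustrative placeholders.

    # Minimal sketch of the inner/outer update pattern used by the scripts above.
    # torch.optim.SGD is a stand-in for DeMo; model and data are toy placeholders.
    import torch

    model = torch.nn.Linear(8, 1)
    inner_opt = torch.optim.AdamW(model.parameters(), lr=1e-3)
    outer_opt = torch.optim.SGD(model.parameters(), lr=0.7)  # placeholder for the DeMo outer optimizer
    local_steps = 4  # inner steps between outer steps (500 in Runs 1-4)

    # snapshot of the parameters at the last outer step, kept on CPU
    params_offloaded = [p.data.detach().clone().to("cpu") for p in model.parameters()]

    for step in range(1, 13):
        # ----- inner step: ordinary mini-batch training -----
        x, y = torch.randn(16, 8), torch.randn(16, 1)
        loss = torch.nn.functional.mse_loss(model(x), y)
        loss.backward()
        inner_opt.step()
        inner_opt.zero_grad(set_to_none=True)

        # ----- outer step every local_steps inner steps -----
        if step % local_steps == 0:
            for p_old, p in zip(params_offloaded, model.parameters()):
                p_old = p_old.to(p.device)
                p.grad = p_old - p.data   # pseudo-gradient: displacement since the last outer step
                p.data.copy_(p_old)       # rewind to the last synced weights...
            outer_opt.step()              # ...then let the outer optimizer apply the update
            outer_opt.zero_grad()
            params_offloaded = [p.data.detach().clone().to("cpu") for p in model.parameters()]
            print(f"outer step at inner step {step}, loss {loss.item():.4f}")

The compression_decay / compression_topk / compression_chunk arguments passed to DeMo in the logged scripts suggest the outer optimizer performs its own compressed communication, which would explain the commented-out all_reduce of the pseudo-gradient, but that detail lives in the external demo module and is not shown here.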
Run 2
State: Crashed | Notes: - | User: kmfoda | Runtime: 8h 46m 17s | Sweep: -
code: identical to the Run 1 script above.
learning_rate: 0.00039991 | perplexity: 1963.3678 | step: 1822 | tokens_per_sec: 3715507.73938 | train_loss: 7.58242 | grad_norm_after_outer: 0 | outer_step: 3
Run 3
State: Crashed | Notes: - | User: kmfoda | Runtime: 8h 46m 34s | Sweep: -
code: identical to the Run 1 script above.
learning_rate: 0.00039991 | perplexity: 2055.90796 | step: 1822 | tokens_per_sec: 3712208.55597 | train_loss: 7.62847 | grad_norm_after_outer: - | outer_step: -
Run 4
State: Crashed | Notes: - | User: kmfoda | Runtime: 8h 46m 34s | Sweep: -
code: identical to the Run 1 script above.
learning_rate: 0.00039991 | perplexity: 1877.95032 | step: 1822 | tokens_per_sec: 3725899.7797 | train_loss: 7.53794 | grad_norm_after_outer: - | outer_step: -
Run 5
State: Failed | Notes: - | User: kmfoda | Runtime: 8d 7h 28m 41s | Sweep: -
code:
Identical to the Run 1 script above except for the following differences.

Hyperparameters:

    B = 32  # batch size per GPU (16 in Runs 1-4), giving gradient_accumulation_steps = 512 // 32 = 16
    local_steps = 88_000  # equal to max_steps (500 in Runs 1-4), so the outer DeMo step is only reached at the very end of training

The progress print reports real_step directly instead of the outer-step index real_step // local_steps:

    print(f"Outer Step: {real_step} - Inner Step: {step} - Loss: {outputs.loss}")

The outer-step branch gates its print on an undefined name (the script defines ddp_local_rank and master_process, not local_rank), so reaching this branch raises a NameError; the Run 1 form `if master_process:` is the intended check. The run's Failed state at step 87999 is consistent with this branch first being reached at real_step 88000, although the table does not record the actual error.

    if local_rank == 0:
        print(f"perform outer step at step {real_step}")

The pseudo-gradient all-reduce is active rather than commented out:

    dist.all_reduce(tensor=param.grad, op=dist.ReduceOp.AVG)

The post-outer-step logging additionally reports a "before" norm and a ratio, but grad_norm_before is never assigned; a grad_norm_before = get_grad_norm(model.parameters()) call captured before outer_optimizer.step() would be needed for this block to run:

    wandb.log({
        "grad_norm_before_outer": grad_norm_before,
        "grad_norm_after_outer": grad_norm_after,
        "grad_norm_ratio": grad_norm_after / (grad_norm_before + 1e-8),
        "outer_step": real_step // local_steps,
    })
learning_rate: 1.3040e-13 | perplexity: 14.14279 | step: 87999 | tokens_per_sec: 1016947.20928 | train_loss: 2.6492 | grad_norm_after_outer: - | outer_step: -
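For interpreting the tokens_per_sec column: each script logs tokens_processed / dt, where tokens_processed = B * T * gradient_accumulation_steps * ddp_world_size is the token count of one full optimizer step. The small sketch below works through that arithmetic; the world size is not recorded in this table, so it is left as a parameter with an illustrative value of 1.

    def tokens_per_optimizer_step(B: int, T: int, batch_size: int, world_size: int) -> int:
        """Token count of one optimizer step, matching the scripts' tokens_processed formula."""
        gradient_accumulation_steps = batch_size // B
        return B * T * gradient_accumulation_steps * world_size

    # Runs 1-4: B=16, T=1024, batch_size=512 -> 32 accumulation steps
    print(tokens_per_optimizer_step(16, 1024, 512, world_size=1))  # 524288 tokens per rank per step
    # Run 5: B=32, T=1024, batch_size=512 -> 16 accumulation steps
    print(tokens_per_optimizer_step(32, 1024, 512, world_size=1))  # 524288, same total per step

Runs 1-4 and Run 5 therefore process the same 524,288 tokens per rank per optimizer step; Run 5 simply reaches it with fewer, larger micro-batches.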