Fine-tuning Project for Text-to-SPARQL
This project is part of my fine-tuning course from Weights & Biases, where I learned the basics of fine-tuning. The aim is to fine-tune t5-base and t5-small on the CFQ dataset, which pairs natural-language questions with SPARQL queries, so the task is essentially a translation problem. We used LoRA from the PEFT library for fine-tuning; the code is attached below. As it is a translation task, we used the ROUGE metric: t5-base scored 7.89 and t5-small scored 6.23, so t5-base gave the better performance.
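To make the task concrete, each CFQ example pairs a natural-language question with its target SPARQL query. Below is a minimal sketch (not part of the original report) for inspecting one example, assuming the Hugging Face datasets library and the "mcd1" configuration used in the code later on:

# Peek at one CFQ example to see the question/SPARQL pair
# (assumes the `datasets` library and the "mcd1" config used below).
from datasets import load_dataset

cfq_data = load_dataset("cfq", "mcd1")
sample = cfq_data["train"][0]
print("Question:", sample["question"])  # natural-language question
print("Query:   ", sample["query"])     # target SPARQL query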
Section 1
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

from datasets import load_dataset
cfq_data = load_dataset("cfq", "mcd1")

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, BitsAndBytesConfig

tokenizer = AutoTokenizer.from_pretrained("t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained(
    "t5-base",
    # quantization_config=BitsAndBytesConfig(
    #     load_in_4bit=True,
    #     llm_int8_threshold=6.0,
    #     llm_int8_has_fp16_weight=False,
    #     bnb_4bit_compute_dtype=torch.float16,
    #     bnb_4bit_use_double_quant=True,
    #     bnb_4bit_quant_type="nf4",
    # ),
    # torch_dtype=torch.float16,
    load_in_8bit=False,
    device_map="auto",
)

from torch import nn

for param in model.parameters():
    param.requires_grad = False  # freeze the base model - train adapters later
    if param.ndim == 1:
        # cast the small parameters (e.g. layernorm) to fp32 for stability
        param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
    def forward(self, x):
        return super().forward(x).to(torch.float32)

model.lm_head = CastOutputToFloat(model.lm_head)

from peft import LoraConfig, get_peft_model
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=["q", "v", "o"],  # attention projections in T5
    lora_dropout=0.05,
    bias="all",
    # task_type="SEQ_2_SEQ_LM",
)
model = get_peft_model(model, config)
model.print_trainable_parameters()

prefix = "translate english to sparql: "
max_input_length = 512
max_target_length = 256

def preprocess_function(examples):
    inputs = []
    targets = []
    for question, query in zip(examples["question"], examples["query"]):
        inputs.append(prefix + question)
        targets.append(query)
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    # Tokenize the targets (SPARQL queries) as labels
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

import nltk
import numpy as np
import evaluate

nltk.download("punkt", quiet=True)  # needed for nltk.sent_tokenize
metric = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    # Note that other metrics may not have a `use_aggregator` parameter
    # and thus will return a list, computing a metric for each sentence
    result = metric.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
        use_aggregator=True,
    )
    result = {key: value * 100 for key, value in result.items()}
    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

import wandb
wandb.login()

os.environ["WANDB_PROJECT"] = "text_to_sparql_finetuning"
os.environ["WANDB_ENTITY"] = "athe_kunal"
os.environ["WANDB_LOG_MODEL"] = "checkpoint"
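The snippet defines preprocess_function but does not show the mapping step that produces the cfq_tokenized_data used by the trainer below. A minimal sketch of that missing step, assuming the standard datasets map API (batched mapping, dropping the raw CFQ columns):

# Tokenize the CFQ splits; the name cfq_tokenized_data matches the trainer below
cfq_tokenized_data = cfq_data.map(
    preprocess_function,
    batched=True,
    remove_columns=cfq_data["train"].column_names,  # keep only tokenized fields
)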
trainer = Seq2SeqTrainer(
    model=model,
    train_dataset=cfq_tokenized_data["train"],
    args=Seq2SeqTrainingArguments(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        # warmup_steps=100,
        max_steps=200,
        per_device_eval_batch_size=8,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        report_to="wandb",
        run_name="finetune-t5-base",
        do_eval=True,
        remove_unused_columns=False,
        evaluation_strategy="epoch",
        num_train_epochs=1.0,
        predict_with_generate=True,  # so compute_metrics receives generated token ids, not logits
    ),
    eval_dataset=cfq_tokenized_data["test"],
    data_collator=DataCollatorForSeq2Seq(tokenizer),
    compute_metrics=compute_metrics,
)
model.config.use_cache = False  # silence the warnings; re-enable for inference
trainer.train()
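The report stops at training. As a hedged sketch (not from the original code), this is how the fine-tuned model could be used to generate a SPARQL query for a test question, reusing model, tokenizer, prefix, max_target_length, and cfq_data from above:

# Sketch of inference with the fine-tuned model (assumes the objects defined above)
model.config.use_cache = True  # re-enable the cache for generation
model.eval()

question = cfq_data["test"][0]["question"]
inputs = tokenizer(prefix + question, return_tensors="pt").to(model.device)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=max_target_length)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))  # predicted SPARQL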
The main challenge was memory management for t5-base: without LoRA, training ran out of GPU memory (OOM). Future work is to explore QLoRA and DeepSpeed to speed up training across multiple GPUs.
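As a starting point for the QLoRA direction, the commented-out quantization_config in the code above already hints at the setup. Below is a minimal, untested sketch (an assumption, not part of the original run), using the bitsandbytes-backed 4-bit loading path in transformers and PEFT's prepare_model_for_kbit_training:

# Hedged sketch of QLoRA: load the base model in 4-bit, prepare it for k-bit
# training, then attach the same LoRA configuration as before.
import torch
from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)
qlora_model = AutoModelForSeq2SeqLM.from_pretrained(
    "t5-base", quantization_config=bnb_config, device_map="auto"
)
qlora_model = prepare_model_for_kbit_training(qlora_model)
qlora_model = get_peft_model(
    qlora_model,
    LoraConfig(r=32, lora_alpha=64, target_modules=["q", "v", "o"], lora_dropout=0.05),
)
qlora_model.print_trainable_parameters()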