Finetune Project for Text to SPARQL

This project is part of my fine-tuning course from Weights & Biases, where I learned the basics of fine-tuning. The aim of the project is to fine-tune t5-base and t5-small on the CFQ dataset, which pairs natural-language questions with SPARQL queries, so the task is essentially translation. We used LoRA from the PEFT library for fine-tuning; the code is attached below. Since it is a translation task, we evaluated with the ROUGE metric: t5-base scored 7.89 and t5-small scored 6.23, so t5-base performed better.

Section 1


This set of panels contains runs from a private project, which cannot be shown in this report

import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

from datasets import load_dataset
cfq_data = load_dataset("cfq","mcd1")
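Each CFQ record pairs a natural-language question with its SPARQL query; a quick check of the 'question' and 'query' fields used in the preprocessing below:

# Print one training example to confirm the field names used later.
print(cfq_data["train"][0]["question"])
print(cfq_data["train"][0]["query"])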

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

tokenizer = AutoTokenizer.from_pretrained("t5-base")

from transformers import BitsAndBytesConfig  # only needed if the 4-bit config below is enabled

model = AutoModelForSeq2SeqLM.from_pretrained(
    "t5-base",
    # max_memory=max_memory,  # max_memory is not defined in this snippet, so it is left disabled
    # quantization_config=BitsAndBytesConfig(
    #     load_in_4bit=True,
    #     llm_int8_threshold=6.0,
    #     llm_int8_has_fp16_weight=False,
    #     bnb_4bit_compute_dtype=torch.float16,
    #     bnb_4bit_use_double_quant=True,
    #     bnb_4bit_quant_type="nf4",
    # ),
    # torch_dtype=torch.float16,
    load_in_8bit=False,
    device_map='auto'
)

from torch import nn

for param in model.parameters():
    param.requires_grad = False  # freeze the model - train adapters later
    if param.ndim == 1:
        # cast the small parameters (e.g. layernorm) to fp32 for stability
        param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable() # reduce number of stored activations
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
    def forward(self, x):
        return super().forward(x).to(torch.float32)

model.lm_head = CastOutputToFloat(model.lm_head)

from peft import LoraConfig, get_peft_model
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=["q", "v", "o"],
    lora_dropout=0.05,
    bias="all",
    # task_type="CAUSAL_LM"
)
model = get_peft_model(model, config)
model.print_trainable_parameters()

prefix = "translate english to sparql: "
max_input_length = 512
max_target_length = 256

def preprocess_function(examples):
    inputs = []
    targets = []
    for question, query in zip(examples['question'], examples['query']):
        inputs.append(prefix + question)
        targets.append(query)
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Set up the tokenizer for targets
    # with tokenizer.as_target_tokenizer():
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
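
The trainer below references cfq_tokenized_data, the tokenized splits; a minimal sketch of that step, assuming the standard datasets map call:

cfq_tokenized_data = cfq_data.map(
    preprocess_function,
    batched=True,
    remove_columns=cfq_data["train"].column_names,  # drop the raw text columns
)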

import nltk
import numpy as np
import evaluate

nltk.download("punkt", quiet=True)  # sentence tokenizer used by compute_metrics below

metric = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # Note that other metrics may not have a `use_aggregator` parameter
    # and thus will return a list, computing a metric for each sentence.
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True, use_aggregator=True)
    # Extract a few results
    result = {key: value * 100 for key, value in result.items()}

    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

import wandb
wandb.login()
import os
os.environ["WANDB_PROJECT"]="text_to_sparql_finetuning"
os.environ['WANDB_ENTITY'] = 'athe_kunal'
os.environ["WANDB_LOG_MODEL"]="checkpoint"

trainer = Seq2SeqTrainer(
    model=model,
    train_dataset=cfq_tokenized_data['train'],
    args=Seq2SeqTrainingArguments(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        # warmup_steps=100,
        max_steps=200,
        per_device_eval_batch_size=8,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir='outputs',
        report_to='wandb',
        run_name='finetune-t5-base',
        do_eval=True,
        predict_with_generate=True,  # so compute_metrics receives generated token ids, not logits
        remove_unused_columns=False,
        evaluation_strategy='epoch',
        num_train_epochs=1.0
    ),
    eval_dataset=cfq_tokenized_data['test'],
    data_collator=DataCollatorForSeq2Seq(tokenizer),
    compute_metrics=compute_metrics
)
model.config.use_cache = False # silence the warnings. Please re-enable for inference!

trainer.train()
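
For inference with the fine-tuned adapters, a minimal sketch (the example question below is hypothetical, not taken from CFQ):

# Re-enable the KV cache that was disabled for training.
model.config.use_cache = True
model.eval()

question = "Did M0 direct M1"  # hypothetical CFQ-style question
inputs = tokenizer(prefix + question, return_tensors="pt").to(model.device)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=max_target_length)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))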
The main challenge was memory management for t5-base: without LoRA, training ran out of GPU memory (OOM). Future work is to explore QLoRA and DeepSpeed to speed up multi-GPU training.
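
As a starting point for the QLoRA direction, a minimal sketch of a 4-bit load mirroring the commented-out quantization config above (assuming bitsandbytes is installed; untested in this project):

from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training
import torch

# 4-bit (NF4) quantized load of the base model; LoRA adapters would then be attached as before.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)
model_4bit = AutoModelForSeq2SeqLM.from_pretrained(
    "t5-base",
    quantization_config=bnb_config,
    device_map="auto",
)
model_4bit = prepare_model_for_kbit_training(model_4bit)  # freezes weights, casts norms, enables grads on inputs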