Finetune Project for Text to SPARQL

This project is part of my fine-tuning course from Weights & Biases, where I learned the basics of fine-tuning. The aim of the project is to fine-tune t5-base and t5-small on the CFQ dataset, which pairs natural-language questions with SPARQL queries, so the task is essentially translation. We used LoRA from the PEFT library for fine-tuning; the code is attached below. Since it is a translation task, we evaluated with the ROUGE metric: t5-base scored 7.89 and t5-small scored 6.23, so t5-base performed better.

Section 1


This set of panels contains runs from a private project, which cannot be shown in this report

import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

from datasets import load_dataset
cfq_data = load_dataset("cfq","mcd1")
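Each CFQ record pairs a natural-language question with its SPARQL query; a quick check of the 'question' and 'query' fields used in the preprocessing below:

# Print one training example to confirm the field names used later.
print(cfq_data["train"][0]["question"])
print(cfq_data["train"][0]["query"])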

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

tokenizer = AutoTokenizer.from_pretrained("t5-base")

from transformers import BitsAndBytesConfig  # only needed if the 4-bit config below is enabled

model = AutoModelForSeq2SeqLM.from_pretrained(
    "t5-base",
    # max_memory=max_memory,  # max_memory is not defined in this snippet, so it is left disabled
    # quantization_config=BitsAndBytesConfig(
    #     load_in_4bit=True,
    #     llm_int8_threshold=6.0,
    #     llm_int8_has_fp16_weight=False,
    #     bnb_4bit_compute_dtype=torch.float16,
    #     bnb_4bit_use_double_quant=True,
    #     bnb_4bit_quant_type="nf4",
    # ),
    # torch_dtype=torch.float16,
    load_in_8bit=False,
    device_map='auto'
)

from torch import nn

for param in model.parameters():
    param.requires_grad = False  # freeze the model - train adapters later
    if param.ndim == 1:
        # cast the small parameters (e.g. layernorm) to fp32 for stability
        param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable() # reduce number of stored activations
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
    def forward(self, x):
        return super().forward(x).to(torch.float32)

model.lm_head = CastOutputToFloat(model.lm_head)

from peft import LoraConfig, get_peft_model
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=["q", "v", "o"],
    lora_dropout=0.05,
    bias="all",
    # task_type="CAUSAL_LM"
)
model = get_peft_model(model, config)
model.print_trainable_parameters()

prefix = "translate english to sparql: "
max_input_length = 512
max_target_length = 256

def preprocess_function(examples):
    inputs = []
    targets = []
    for question, query in zip(examples['question'], examples['query']):
        inputs.append(prefix + question)
        targets.append(query)
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Set up the tokenizer for targets
    # with tokenizer.as_target_tokenizer():
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
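
The trainer below references cfq_tokenized_data, the tokenized splits; a minimal sketch of that step, assuming the standard datasets map call:

cfq_tokenized_data = cfq_data.map(
    preprocess_function,
    batched=True,
    remove_columns=cfq_data["train"].column_names,  # drop the raw text columns
)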

import nltk
import numpy as np
import evaluate

nltk.download("punkt", quiet=True)  # sentence tokenizer used by compute_metrics below

metric = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # Note that other metrics may not have a `use_aggregator` parameter
    # and thus will return a list, computing a metric for each sentence.
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True, use_aggregator=True)
    # Extract a few results
    result = {key: value * 100 for key, value in result.items()}

    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

import wandb
wandb.login()
import os
os.environ["WANDB_PROJECT"]="text_to_sparql_finetuning"
os.environ['WANDB_ENTITY'] = 'athe_kunal'
os.environ["WANDB_LOG_MODEL"]="checkpoint"

trainer = Seq2SeqTrainer(
    model=model,
    train_dataset=cfq_tokenized_data['train'],
    args=Seq2SeqTrainingArguments(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        # warmup_steps=100,
        max_steps=200,
        per_device_eval_batch_size=8,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir='outputs',
        report_to='wandb',
        run_name='finetune-t5-base',
        do_eval=True,
        predict_with_generate=True,  # so compute_metrics receives generated token ids, not logits
        remove_unused_columns=False,
        evaluation_strategy='epoch',
        num_train_epochs=1.0
    ),
    eval_dataset=cfq_tokenized_data['test'],
    data_collator=DataCollatorForSeq2Seq(tokenizer),
    compute_metrics=compute_metrics
)
model.config.use_cache = False # silence the warnings. Please re-enable for inference!

trainer.train()
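
For inference with the fine-tuned adapters, a minimal sketch (the example question below is hypothetical, not taken from CFQ):

# Re-enable the KV cache that was disabled for training.
model.config.use_cache = True
model.eval()

question = "Did M0 direct M1"  # hypothetical CFQ-style question
inputs = tokenizer(prefix + question, return_tensors="pt").to(model.device)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=max_target_length)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))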
The main challenge was memory management for t5-base: without LoRA, training ran out of GPU memory (OOM). Future work is to explore QLoRA and DeepSpeed to speed up multi-GPU training.
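
As a starting point for the QLoRA direction, a minimal sketch of a 4-bit load mirroring the commented-out quantization config above (assuming bitsandbytes is installed; untested in this project):

from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training
import torch

# 4-bit (NF4) quantized load of the base model; LoRA adapters would then be attached as before.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)
model_4bit = AutoModelForSeq2SeqLM.from_pretrained(
    "t5-base",
    quantization_config=bnb_config,
    device_map="auto",
)
model_4bit = prepare_model_for_kbit_training(model_4bit)  # freezes weights, casts norms, enables grads on inputs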