Spaces:
Running
Running
| # -*- coding: utf-8 -*- | |
| """Untitled4.ipynb | |
| Automatically generated by Colab. | |
| Original file is located at | |
| https://colab.research.google.com/drive/19SAJcA_N4eQVyeNjT1iFdgpyLvvtSSEw | |
| """ | |
| !pip install transformers datasets accelerate -q | |
| from google.colab import files | |
| uploaded = files.upload() | |
| from datasets import Dataset | |
| from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments | |
| import pandas as pd | |
| import torch | |
# --- Load data and model ---
df = pd.read_csv("flan_t5_true_false_dataset.csv")  # adjust filename if needed
dataset = Dataset.from_pandas(df)  # wrap as HF Dataset so we can .map() below

# FLAN-T5 base checkpoint: tokenizer + seq2seq LM head.
model_name = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
# Preprocessing
def preprocess(example):
    """Tokenize a batch of input/output pairs for seq2seq fine-tuning.

    Args:
        example: batch dict with "input" and "output" lists of strings
            (as supplied by ``Dataset.map(..., batched=True)``).

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels``; label
        padding positions are set to -100 so they are ignored by the loss.
    """
    inputs = tokenizer(
        example["input"], padding="max_length", truncation=True, max_length=256
    )
    # `text_target=` replaces the deprecated `tokenizer.as_target_tokenizer()`
    # context manager while producing the same target-side encoding.
    labels = tokenizer(
        text_target=example["output"],
        padding="max_length",
        truncation=True,
        max_length=64,
    )
    # Fix: labels are pre-padded here, so DataCollatorForSeq2Seq will NOT
    # replace pad tokens with -100 for us. Without this mask the pad tokens
    # are counted in the cross-entropy loss.
    pad_id = tokenizer.pad_token_id
    inputs["labels"] = [
        [(tok if tok != pad_id else -100) for tok in seq]
        for seq in labels["input_ids"]
    ]
    return inputs
# Map the tokenizer over the whole dataset in batches.
tokenized_dataset = dataset.map(preprocess, batched=True)

# Training configuration; mixed precision only when a CUDA device exists.
training_args = Seq2SeqTrainingArguments(
    output_dir="./flan_t5_finetuned_model",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,  # eval batch size (unused without eval set)
    logging_steps=100,
    save_steps=500,
    save_total_limit=1,
    fp16=torch.cuda.is_available(),
)
| # Trainer setup | |
| trainer = Trainer( | |
| model=model, | |
| args=training_args, | |
| train_dataset=tokenized_dataset, | |
| tokenizer=tokenizer, | |
| data_collator=DataCollatorForSeq2Seq(tokenizer, model) | |
| ) | |
| # Start training | |
| trainer.train() | |
| !zip -r flan_t5_finetuned_model.zip flan_t5_finetuned_model | |
| files.download("flan_t5_finetuned_model.zip") | |
import pandas as pd

# Build a synthetic 150-row evaluation set of fact -> true/false-question pairs.
rows = []
for idx in range(150):
    fact = f"The moon is made of cheese {idx}."
    rows.append({
        "input": f"Convert this fact into a true/false question: {fact}",
        "output": f"{fact} True or False?",
    })

df = pd.DataFrame(rows)
df.to_csv("flan_t5_eval.csv", index=False)

from google.colab import files
files.download('flan_t5_eval.csv')
| !pip install transformers datasets bert-score sentence-transformers -q | |
| from google.colab import files | |
| uploaded = files.upload() | |
| EVAL_CSV = "/content/flan_t5_eval.csv" | |
| !ls -l ./flan_t5_finetuned | |