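# Offline RL fine-tuning with trlx's ILQL trainer: a FLAN-T5 summarizer is
# trained on the CarperAI OpenAI TL;DR human preference comparisons, with
# chosen summaries labeled +1 and rejected summaries -1. A GPT-J reward model
# is loaded only to score generations during evaluation.
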
import os

import torch
from datasets import load_dataset
from reward_model.reward_model import GPTRewardModel
from transformers import AutoTokenizer

import trlx
from trlx.data.default_configs import (
    ILQLConfig,
    ModelConfig,
    OptimizerConfig,
    SchedulerConfig,
    TokenizerConfig,
    TrainConfig,
    TRLConfig,
)
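
# Default trlx configuration: ILQL on a FLAN-T5-XL checkpoint that has already
# been supervised-fine-tuned on TL;DR summarization.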
default_config = TRLConfig(
    train=TrainConfig(
        seq_length=550,
        batch_size=8,
        epochs=100,
        total_steps=5000,
        checkpoint_interval=10000,
        eval_interval=1000,
        pipeline="PromptPipeline",
        trainer="AccelerateILQLTrainer",
        checkpoint_dir="ilql_summarize_t5",
    ),
    model=ModelConfig(model_path="pvduy/flant5-xl_openai_tldr_sft", num_layers_unfrozen=-1, model_arch_type="seq2seq"),
    tokenizer=TokenizerConfig(tokenizer_path="pvduy/flant5-xl_openai_tldr_sft", truncation_side="left"),
    optimizer=OptimizerConfig(name="adamw", kwargs=dict(lr=1e-6, betas=(0.9, 0.95), eps=1.0e-8, weight_decay=1.0e-6)),
    scheduler=SchedulerConfig(name="cosine_annealing", kwargs=dict(T_max=5000, eta_min=1e-6)),
    method=ILQLConfig(
        name="ilqlconfig",
        tau=0.6,
        gamma=0.99,
        cql_scale=0.1,
        awac_scale=1,
        alpha=0.0001,
        beta=0,
        steps_for_target_q_sync=1,
        two_qs=True,
        gen_kwargs=dict(max_new_tokens=50, top_k=50, beta=[1, 2, 3], temperature=1.0),
    ),
)
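
# Notes on the ILQL hyperparameters above: tau is the expectile used for the
# value losses, cql_scale weights the conservative Q-learning regularizer, and
# awac_scale weights the advantage-weighted policy loss. In gen_kwargs, beta
# controls how strongly Q-values steer sampling; trlx accepts a list such as
# [1, 2, 3] and reports evaluations for each setting.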

# Download the GPT-J reward-model checkpoint on first run.
REWARD_CHECKPOINT_PATH = "reward_model/rm_checkpoint/pytorch_model.bin"
if not os.path.exists(REWARD_CHECKPOINT_PATH):
    os.makedirs("reward_model/rm_checkpoint", exist_ok=True)
    os.system(
        f"wget -O {REWARD_CHECKPOINT_PATH} \
        https://huggingface.co/CarperAI/openai_summarize_tldr_rm_checkpoint/resolve/main/pytorch_model.bin"
    )
SFT_MODEL_PATH = "CarperAI/openai_summarize_tldr_sft"


def main(hparams={}):
    config = TRLConfig.update(default_config, hparams)

    # Load the GPT-J reward model used by metric_fn during evaluation.
    rw_tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
    rw_tokenizer.pad_token = rw_tokenizer.eos_token
    rw_model = GPTRewardModel(SFT_MODEL_PATH)
    rw_model.load_state_dict(torch.load(REWARD_CHECKPOINT_PATH))
    rw_model.half()  # fp16 inference to save memory
    rw_model.eval()
    rw_device = torch.device("cuda:{}".format(1))  # set reward model device
    rw_model.to(rw_device)
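
    # Score complete "<post> TL;DR: <summary>" strings with the reward model,
    # in small batches to bound memory use.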
    def reward_fn(samples):
        scores_list = []
        batch_size = 2
        for i in range(0, len(samples), batch_size):
            sub_samples = samples[i : i + batch_size]
            sub_samples = ["<|startoftext|>" + chosen + "<|endoftext|>" for chosen in sub_samples]
            encodings_dict = rw_tokenizer(
                sub_samples,
                truncation=True,
                max_length=config.train.seq_length,
                padding="max_length",
                return_tensors="pt",
            )
            input_ids = encodings_dict["input_ids"].to(rw_device)
            attn_masks = encodings_dict["attention_mask"].to(rw_device)
            # GPTRewardModel splits each batch into (chosen, rejected) halves,
            # so duplicate the inputs and read scores from the first half.
            input_ids = input_ids.repeat(2, 1)
            attn_masks = attn_masks.repeat(2, 1)
            with torch.no_grad():
                sub_scores = rw_model(input_ids=input_ids, attention_mask=attn_masks)
            scores_list.append(sub_scores["chosen_end_scores"])
        scores = torch.cat(scores_list, dim=0)
        return scores
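
    # ILQL trains on an offline dataset of (prompt, output) pairs with scalar
    # rewards. Each comparison row expands into two such pairs:
    #   ("<post> TL;DR:", "<chosen summary>")   -> reward  1
    #   ("<post> TL;DR:", "<rejected summary>") -> reward -1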
    def preprocess(sample):
        # [7:] strips the leading "TL;DR: " prefix from the dataset's summaries.
        sample["prompt_output"] = [
            [sample["prompt"] + " TL;DR:", sample["chosen"][7:]],
            [sample["prompt"] + " TL;DR:", sample["rejected"][7:]],
        ]
        sample["reward"] = [1, -1]
        return sample
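
    # Flatten the per-row pairs into two parallel lists, one of (prompt,
    # output) pairs and one of rewards, for trlx.train's offline dataset.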
    dataset = load_dataset("CarperAI/openai_summarize_comparisons")
    dataset = dataset.map(preprocess)
    prompts_outputs = sum(dataset["train"]["prompt_output"], [])
    rewards = sum(dataset["train"]["reward"], [])

    val_dataset = load_dataset("CarperAI/openai_summarize_tldr", split="valid")
    eval_prompts = list(val_dataset["prompt"])[:1000]
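
    # Training rewards come entirely from the offline +/-1 labels; the GPT-J
    # reward model only scores generations on eval_prompts via metric_fn.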
    trlx.train(
        dataset=(prompts_outputs, rewards),
        metric_fn=lambda samples, **kwargs: {"rewards": reward_fn(samples)},
        eval_prompts=eval_prompts,
        config=config,
    )


if __name__ == "__main__":
    import json
    import sys

    hparams = {} if len(sys.argv) == 1 else json.loads(sys.argv[1])
    main(hparams)
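
# Example invocation (hypothetical file name; two GPUs assumed, since the
# reward model is pinned to cuda:1 while training runs via accelerate):
#   accelerate launch ilql_summarize_t5.py
# Overrides are passed as a JSON dict, assuming TRLConfig.update merges
# nested keys, e.g.:
#   accelerate launch ilql_summarize_t5.py '{"train": {"total_steps": 1000}}'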