"""Fine-tune TinyLlama with LoRA adapters on (event -> claim) pairs from a CSV.

Pipeline:
  1. Read ``../data/dataset.csv`` and melt the ``Event1``..``Event5`` columns
     into one row per (event, ``Normalized``) pair.
  2. Wrap every pair in a randomly chosen instruction and prompt layout.
  3. Tokenize, supervising only the response tokens: prompt and padding
     positions are set to ``IGNORE_INDEX`` in the labels.
  4. Train LoRA adapters (4-bit base model when CUDA and bitsandbytes are
     both available) and save the adapter and tokenizer.

The script runs top to bottom when executed; it is not meant to be imported.
"""
import random

import pandas as pd
import torch
from datasets import Dataset
from peft import (
    LoraConfig,
    TaskType,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,  # noqa: F401  (import kept; see section 8)
    Trainer,
    TrainingArguments,
    default_data_collator,
)

SEED = 42
IGNORE_INDEX = -100  # label value that the cross-entropy loss skips

# Seed the template/layout sampling so the generated prompts are reproducible.
random.seed(SEED)

# =========================
# 1. LOAD DATA
# =========================
df = pd.read_csv("../data/dataset.csv")

event_cols = ["Event1", "Event2", "Event3", "Event4", "Event5"]

long_df = df.melt(
    id_vars=["Normalized"],
    value_vars=event_cols,
    var_name="event_column",
    value_name="event",
)

# Both halves of a pair are required: a missing target would otherwise reach
# ``format_example`` as NaN/None and fail on ``.strip()``.
long_df = long_df.dropna(subset=["event", "Normalized"])

# =========================
# 2. INSTRUCTION VARIATION
# =========================
INSTRUCTION_TEMPLATES = [
    "Create a disinformation claim based on the real world event.",
    "Write a misleading claim about the following event.",
    "Generate a false narrative inspired by this event.",
    "Produce a distorted version of the event as a claim.",
    "Turn this event into a disinformation statement.",
    "Imagine you are working as a pre-bunker, what narratives might appear after this event.",
    "How could this event be misinterpreted falsely as disinformation",
]

# Structural formats: each callable takes (instruction, input) and returns the
# prompt text that precedes the response.
FORMATS = [
    lambda i, inp: f"### Instruction:\n{i}\n\n### Input:\n{inp}\n\n### Response:\n",
    lambda i, inp: f"Instruction: {i}\nInput: {inp}\nOutput:",
    lambda i, inp: f"{i}\n\nEvent:\n{inp}\n\nClaim:",
    lambda i, inp: f"Task -> {i}\nData -> {inp}\nAnswer:",
]


def format_example(ex):
    """Build a randomized prompt for one example.

    Args:
        ex: Mapping with an ``"output"`` string and an optional ``"input"``
            string.

    Returns:
        ``(prompt, output)``: the prompt produced by a randomly chosen
        instruction and layout, and the stripped target text.
    """
    instruction = random.choice(INSTRUCTION_TEMPLATES)
    inp = ex.get("input", "").strip()
    out = ex["output"].strip()

    formatter = random.choice(FORMATS)
    prompt = formatter(instruction, inp)

    return prompt, out


# =========================
# 3. BUILD DATASET
# =========================
# Cast to ``str`` so a numeric-looking cell cannot break ``.strip()`` or give
# the dataset a mixed-type column.
toy_instr_data = [
    {"input": str(event), "output": str(claim)}
    for event, claim in zip(long_df["event"], long_df["Normalized"])
]

toy_ds = Dataset.from_list(toy_instr_data)
toy_ds = toy_ds.train_test_split(test_size=0.3, seed=SEED)

# =========================
# 4. TOKENIZER
# =========================
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tok = AutoTokenizer.from_pretrained(model_name)
tok.pad_token = tok.eos_token
# The prompt mask below assumes the prompt starts at position 0, which only
# holds when padding is appended on the right.
tok.padding_side = "right"

MAX_LENGTH = 256  # increased context length


# =========================
# 5. TOKENIZATION WITH MASKING
# =========================
def build_labels(input_ids, attention_mask, prompt_len):
    """Return labels that supervise only the response tokens.

    Args:
        input_ids: Token ids of the full (prompt + response) sequence.
        attention_mask: 1 for real tokens, 0 for padding; same length as
            ``input_ids``.
        prompt_len: Number of leading tokens that belong to the prompt.

    Returns:
        A new list equal to ``input_ids`` except that prompt and padding
        positions hold ``IGNORE_INDEX``.
    """
    # Padding is detected through the attention mask rather than the token id:
    # the pad token is the EOS token here, and the real EOS must stay
    # supervised.
    return [
        IGNORE_INDEX if position < prompt_len or mask == 0 else token_id
        for position, (token_id, mask) in enumerate(zip(input_ids, attention_mask))
    ]


def tokenize_lm(example):
    """Tokenize one example into fixed-length inputs with masked labels.

    Args:
        example: Mapping accepted by ``format_example``.

    Returns:
        The tokenizer output for ``prompt + output + EOS`` (padded/truncated
        to ``MAX_LENGTH``) with an added ``"labels"`` entry.
    """
    prompt, out = format_example(example)
    full_text = prompt + out + tok.eos_token

    tokenized = tok(
        full_text,
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )

    # Tokenized separately only to measure how many leading tokens to mask.
    prompt_ids = tok(
        prompt,
        truncation=True,
        max_length=MAX_LENGTH,
    )["input_ids"]

    tokenized["labels"] = build_labels(
        tokenized["input_ids"],
        tokenized["attention_mask"],
        len(prompt_ids),
    )
    return tokenized


toy_tok = toy_ds.map(tokenize_lm, remove_columns=toy_ds["train"].column_names)
# Drop examples whose prompt fills the whole window: with every label ignored
# the loss for that batch is undefined.
toy_tok = toy_tok.filter(
    lambda ex: any(label != IGNORE_INDEX for label in ex["labels"])
)
toy_tok.set_format(type="torch")

# =========================
# 6. DEVICE + OPTIONAL QUANT
# =========================
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

bnb_available = False
try:
    import bitsandbytes  # noqa: F401  (imported only to probe availability)
    bnb_available = DEVICE == "cuda"
except ImportError:
    pass

quant_kwargs = {}
if bnb_available:
    from transformers import BitsAndBytesConfig

    quant_kwargs["quantization_config"] = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        # Match the fp16 mixed precision requested in the training arguments.
        bnb_4bit_compute_dtype=torch.float16,
    )
    quant_kwargs["device_map"] = {"": 0}

# =========================
# 7. MODEL + LORA (IMPROVED)
# =========================
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    **quant_kwargs,
)

if bnb_available:
    # Recommended PEFT preparation step before attaching adapters to a
    # k-bit quantized base model.
    base_model = prepare_model_for_kbit_training(base_model)

lora_cfg = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,  # increased rank
    lora_alpha=64,  # increased scaling
    lora_dropout=0.05,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

model = get_peft_model(base_model, lora_cfg)

# =========================
# 8. TRAINING ARGS (IMPROVED)
# =========================
training_args = TrainingArguments(
    output_dir="./ft_lora2",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,  # improves effective batch size
    num_train_epochs=5,
    learning_rate=2e-5,
    warmup_ratio=0.1,  # added warmup
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    optim="adamw_torch",
    fp16=torch.cuda.is_available(),  # mixed precision
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="none",
)

# ``DataCollatorForLanguageModeling(mlm=False)`` rebuilds ``labels`` from
# ``input_ids``, which would discard the prompt mask computed in
# ``tokenize_lm``. Every example is already padded to ``MAX_LENGTH``, so the
# default collator, which only stacks the existing fields, is sufficient.
data_collator = default_data_collator

# =========================
# 9. TRAINER
# =========================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=toy_tok["train"],
    eval_dataset=toy_tok["test"],
    data_collator=data_collator,
)

# =========================
# 10. TRAIN
# =========================
trainer.train()

metrics = trainer.evaluate()
print(metrics)

# =========================
# 11. SAVE ADAPTER
# =========================
model.save_pretrained("./ft_lora2_adapter")
tok.save_pretrained("./ft_lora2_adapter")