# finemodel/.gitignore (new file in the same change) ignores training output:
#     ft_*/
#
# finemodel/lora2.py (new file)
# https://github.com/karimiannima/LLM-Fine-Tuning-Step-by-Step-Tutorial/blob/main/LLM_Fine_Tuning_Tutorial.ipynb
"""Fine-tune distilgpt2 with a LoRA adapter on an instruction-style dataset.

The script reads ``../data/dataset-dev.csv``, melts the ``Event1``..``Event5``
columns into one ``event`` column paired with ``Normalized``, renders each pair
as an "### Instruction / ### Input / ### Response" text, tokenizes it, trains a
LoRA adapter with ``Trainer`` and saves the adapter to ``./ft_lora_adapter``.
"""
import pandas as pd
import torch
from datasets import Dataset
from peft import LoraConfig, TaskType, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,  # noqa: F401 - no longer used, import kept
    Trainer,
    TrainingArguments,
    default_data_collator,
)

# Load your CSV (path is relative to the current working directory)
df = pd.read_csv("../data/dataset-dev.csv")

# Event columns
event_cols = ["Event1", "Event2", "Event3", "Event4", "Event5"]

# Melt wide -> long format
long_df = df.melt(
    id_vars=["Normalized"],
    value_vars=event_cols,
    var_name="event_column",
    value_name="event",
)

# Drop rows missing either side of the pair: a NaN in "Normalized" would
# otherwise reach ``.strip()`` in format_example and raise AttributeError.
long_df = long_df.dropna(subset=["event", "Normalized"])

# Build instruction-format dataset. Values are cast to str so that a column
# pandas parsed as numeric cannot break the ``.strip()`` calls below.
toy_instr_data = [
    {
        "instruction": "create a disinformation claim based on the real world event",
        "input": str(event),
        "output": str(claim),
    }
    for event, claim in zip(long_df["event"], long_df["Normalized"])
]

# Example: print first few
print(toy_instr_data[:3])

tok_gpt = AutoTokenizer.from_pretrained("distilgpt2")
# GPT-2 has no pad token; reuse EOS for padding.
tok_gpt.pad_token = tok_gpt.eos_token

# Labels are built in tokenize_lm, so the collator only has to stack tensors.
# DataCollatorForLanguageModeling(mlm=False) would rebuild labels from
# input_ids and mask every pad-token id; because pad == EOS here, that also
# hides the real end-of-text token and the model never learns to stop.
data_collator = default_data_collator


def format_example(ex):
    """Split one instruction record into ``(prompt, expected_output)``.

    ``ex`` is a mapping with ``"instruction"`` and ``"output"`` keys and an
    optional ``"input"`` key. All three values are stripped. The
    ``### Input:`` section is omitted when the input is empty.
    """
    instruction = ex["instruction"].strip()
    inp = ex.get("input", "").strip()
    out = ex["output"].strip()
    if inp:
        prompt = f"### Instruction:\n{instruction}\n\n### Input:\n{inp}\n\n### Response:\n"
    else:
        prompt = f"### Instruction:\n{instruction}\n\n### Response:\n"
    return prompt, out


def build_text(example):
    """Return ``{"text": prompt + output + EOS}`` for one record.

    Uses the module-level ``tok_gpt`` for the EOS string.
    """
    prompt, out = format_example(example)
    return {"text": prompt + out + tok_gpt.eos_token}


toy_ds = Dataset.from_list(toy_instr_data).map(build_text)
toy_ds = toy_ds.train_test_split(test_size=0.3, seed=42)


def tokenize_lm(batch):
    """Tokenize a batch of texts to fixed length and attach causal-LM labels.

    Labels copy ``input_ids`` wherever ``attention_mask`` is 1 and are -100
    (ignored by the loss) on padding. The real EOS token keeps its label even
    though it shares an id with the pad token.
    """
    enc = tok_gpt(batch["text"], truncation=True, padding="max_length", max_length=256)
    enc["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(enc["input_ids"], enc["attention_mask"])
    ]
    return enc


# Drop every raw string column so only tensor-convertible features remain.
toy_tok = toy_ds.map(
    tokenize_lm,
    batched=True,
    remove_columns=toy_ds["train"].column_names,
)
toy_tok.set_format(type="torch")

# Check if CUDA is available
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Optional: 4/8-bit quantization if bitsandbytes + CUDA are available
bnb_available = False
try:
    import bitsandbytes  # noqa: F401 - imported only to probe availability

    bnb_available = DEVICE == "cuda"
except ImportError:
    pass

quant_kwargs = {}
if bnb_available:
    from transformers import BitsAndBytesConfig

    quant_kwargs["quantization_config"] = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
    )
    quant_kwargs["device_map"] = {"": 0}  # place the whole model on GPU 0
    # NOTE(review): PEFT's docs recommend prepare_model_for_kbit_training()
    # before get_peft_model() for k-bit models - confirm whether it is needed.

base_lm = AutoModelForCausalLM.from_pretrained("distilgpt2", **quant_kwargs)

lora_cfg = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,  # rank
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["c_attn", "c_proj"],  # common GPT-2 modules
    fan_in_fan_out=True,
)

lora_model = get_peft_model(base_lm, lora_cfg)

args_lora = TrainingArguments(
    output_dir="./ft_lora",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=20,
    learning_rate=1e-4,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    optim="adamw_torch",
)

trainer_lora = Trainer(
    model=lora_model,
    args=args_lora,
    train_dataset=toy_tok["train"],
    eval_dataset=toy_tok["test"],
    data_collator=data_collator,
)

trainer_lora.train()
lora_metrics = trainer_lora.evaluate()
# A bare expression only echoes in a notebook; print so script runs show it.
print(lora_metrics)

# Save the adapter weights
lora_model.save_pretrained("./ft_lora_adapter")
# finemodel/q_lora2.py (new file)
"""Interactively compare base distilgpt2 with its LoRA fine-tuned variant.

Loads the base model twice (once plain, once wrapped with the adapter stored
at ``ADAPTER_PATH``), then reads events from stdin and prints a sampled
completion from each model for the same prompt.
"""
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# -----------------------------
# Config
# -----------------------------
BASE_MODEL_NAME = "distilgpt2"
ADAPTER_PATH = "./ft_lora_adapter"

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# -----------------------------
# Tokenizer
# -----------------------------
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

# -----------------------------
# Load BASE model
# -----------------------------
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_NAME,
    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
)
base_model.to(DEVICE)
base_model.eval()

# -----------------------------
# Load LoRA model
# -----------------------------
# A second copy of the base weights is loaded so the adapter wraps its own
# model and ``base_model`` stays untouched.
lora_base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_NAME,
    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
)

lora_model = PeftModel.from_pretrained(lora_base, ADAPTER_PATH)
lora_model.to(DEVICE)
lora_model.eval()


# -----------------------------
# Prompt builder (MUST match training)
# -----------------------------
def build_prompt(instruction, inp):
    """Render the prompt in the same layout the training script uses.

    Both fields are stripped and the ``### Input:`` section is left out when
    ``inp`` is empty, mirroring ``format_example`` in ``lora2.py``.
    """
    instruction = instruction.strip()
    inp = (inp or "").strip()
    if inp:
        return (
            f"### Instruction:\n{instruction}\n\n"
            f"### Input:\n{inp}\n\n"
            f"### Response:\n"
        )
    return f"### Instruction:\n{instruction}\n\n### Response:\n"


# -----------------------------
# Generate function
# -----------------------------
@torch.no_grad()
def generate(model, prompt, max_new_tokens=80):
    """Sample a continuation of ``prompt`` from ``model`` and return its text.

    Only the newly generated tokens are decoded. Splitting the full text on
    "### Response:" returns the wrong span whenever that marker shows up in
    the event text or in the sampled continuation itself.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)

    output = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.8,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
    )

    # For a decoder-only model the returned sequence starts with the prompt.
    prompt_len = inputs["input_ids"].shape[1]
    completion = tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)
    return completion.strip()


# -----------------------------
# Compare function
# -----------------------------
def compare(event_input):
    """Print the base and LoRA completions for one event side by side."""
    instruction = "create a disinformation claim based on the real world event"
    prompt = build_prompt(instruction, event_input)

    print("\n" + "=" * 80)
    print("INPUT EVENT:")
    print(event_input)
    print("=" * 80)

    base_out = generate(base_model, prompt)
    lora_out = generate(lora_model, prompt)

    print("\n🧠 BASE MODEL OUTPUT (distilgpt2):")
    print("-" * 80)
    print(base_out)

    print("\n🎯 LoRA FINE-TUNED OUTPUT:")
    print("-" * 80)
    print(lora_out)

    print("\n" + "=" * 80)


# -----------------------------
# Interactive loop
# -----------------------------
if __name__ == "__main__":
    print("Base vs LoRA comparison ready. Type 'exit' to quit.\n")

    while True:
        try:
            event = input("Enter event: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C ends the session without a traceback.
            print()
            break

        if event.lower() in ("exit", "quit"):
            break

        compare(event)