From 0e5a1c18cd409e09ca70351783e98453528a69bf Mon Sep 17 00:00:00 2001
From: WillJeynes
Date: Sat, 11 Apr 2026 12:02:18 +0100
Subject: [PATCH] Add deepseek version, full trained version. Add results

---
Review note: file contents were amended during review, so the blob ids on
the "index" lines below no longer match the new contents.

 finemodel/README.md   |   8 +++
 finemodel/full.py     | 127 +++++++++++++++++++++++++++++++++++++
 finemodel/lora23.py   | 146 ++++++++++++++++++++++++++++++++++++++++++
 finemodel/q_full.py   |  92 +++++++++++++++++++++++++++
 finemodel/q_lora23.py | 111 ++++++++++++++++++++++++++++++++
 5 files changed, 484 insertions(+)
 create mode 100644 finemodel/README.md
 create mode 100644 finemodel/full.py
 create mode 100644 finemodel/lora23.py
 create mode 100644 finemodel/q_full.py
 create mode 100644 finemodel/q_lora23.py

diff --git a/finemodel/README.md b/finemodel/README.md
new file mode 100644
index 0000000..9fd8a3e
--- /dev/null
+++ b/finemodel/README.md
@@ -0,0 +1,8 @@
+# Results
+
+| Model/Technique            | Coherence | Plausibility | Disinformation? |
+|----------------------------|-----------|--------------|-----------------|
+| distilGPT2 + LoRA          | 6/9       | 4/9          | 2/9             |
+| miniLLama + LoRA           | 7/9       | 6/9          | 4/9             |
+| deepseek + LoRA            | 7/9       | 5/9          | 5/9             |
+| distilGPT2 (full training) | 4/9       | 3/9          | 2/9             |
diff --git a/finemodel/full.py b/finemodel/full.py
new file mode 100644
index 0000000..f508909
--- /dev/null
+++ b/finemodel/full.py
@@ -0,0 +1,127 @@
+"""Fine-tune all distilgpt2 weights on event -> claim instruction pairs."""
+import torch
+from datasets import Dataset
+from transformers import (
+    AutoModelForCausalLM,
+    TrainingArguments,
+    Trainer,
+    AutoTokenizer,
+    default_data_collator,
+)
+import pandas as pd
+
+# Load your CSV
+df = pd.read_csv("../data/dataset.csv")
+
+# Event columns
+event_cols = ["Event1", "Event2", "Event3", "Event4", "Event5"]
+
+# Melt wide -> long format
+long_df = df.melt(
+    id_vars=["Normalized"],
+    value_vars=event_cols,
+    var_name="event_column",
+    value_name="event"
+)
+
+# Drop rows missing either half of the pair: a NaN claim would reach
+# format_example() as a float and crash on .strip()
+long_df = long_df.dropna(subset=["event", "Normalized"])
+
+# Build instruction-format dataset
+toy_instr_data = [
+    {
+        "instruction": "create a disinformation claim based on the real world event",
+        "input": event,
+        "output": claim
+    }
+    for event, claim in zip(long_df["event"], long_df["Normalized"])
+]
+
+model_name = "distilgpt2"
+tok_gpt = AutoTokenizer.from_pretrained(model_name)
+
+# Reuse EOS as the padding token
+tok_gpt.pad_token = tok_gpt.eos_token
+
+# Labels are built in tokenize_lm(), so batches only need stacking.
+# DataCollatorForLanguageModeling(mlm=False) is deliberately not used: it
+# sets every label equal to pad_token_id to -100, and with pad == EOS that
+# also hides the real end-of-text token, so the model never learns to stop.
+data_collator = default_data_collator
+
+def format_example(ex):
+    """Return (prompt, response) for one instruction record."""
+    instruction = ex["instruction"].strip()
+    inp = ex.get("input", "").strip()
+    out = ex["output"].strip()
+    if inp:
+        prompt = f"### Instruction:\n{instruction}\n\n### Input:\n{inp}\n\n### Response:\n"
+    else:
+        prompt = f"### Instruction:\n{instruction}\n\n### Response:\n"
+    return prompt, out
+
+def build_text(example):
+    """Concatenate prompt and response, terminated by EOS."""
+    prompt, out = format_example(example)
+    return {"text": prompt + out + tok_gpt.eos_token}
+
+toy_ds = Dataset.from_list(toy_instr_data).map(build_text)
+toy_ds = toy_ds.train_test_split(test_size=0.3, seed=42)
+
+def tokenize_lm(batch):
+    """Tokenize to a fixed length and attach causal-LM labels."""
+    enc = tok_gpt(
+        batch["text"],
+        truncation=True,
+        padding="max_length",
+        max_length=256
+    )
+    # Mask by attention_mask, not by token id: padding is excluded from the
+    # loss (-100) while the genuine EOS remains a training target.
+    enc["labels"] = [
+        [tok if keep else -100 for tok, keep in zip(ids, mask)]
+        for ids, mask in zip(enc["input_ids"], enc["attention_mask"])
+    ]
+    return enc
+
+toy_tok = toy_ds.map(tokenize_lm, batched=True, remove_columns=["text"])
+toy_tok.set_format(type="torch")
+
+
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+model = AutoModelForCausalLM.from_pretrained(model_name).to(DEVICE)
+
+args = TrainingArguments(
+    output_dir="./ft_gt_full",
+    per_device_train_batch_size=4,
+    per_device_eval_batch_size=4,
+    gradient_accumulation_steps=1,
+    num_train_epochs=5,
+    learning_rate=5e-5,
+    eval_strategy="epoch",
+    save_strategy="epoch",
+    logging_steps=10,
+    optim="adamw_torch",
+    load_best_model_at_end=True,
+    metric_for_best_model="eval_loss",
+    greater_is_better=False,
+    fp16=torch.cuda.is_available(),
+)
+
+trainer = Trainer(
+    model=model,
+    args=args,
+    train_dataset=toy_tok["train"],
+    eval_dataset=toy_tok["test"],
+    data_collator=data_collator,
+)
+
+trainer.train()
+
+metrics = trainer.evaluate()
+print(metrics)
+
+trainer.save_model("./ft_gt_full")
+tok_gpt.save_pretrained("./ft_gt_full")
diff --git a/finemodel/lora23.py b/finemodel/lora23.py
new file mode 100644
index 0000000..db90750
--- /dev/null
+++ b/finemodel/lora23.py
@@ -0,0 +1,146 @@
+# https://github.com/karimiannima/LLM-Fine-Tuning-Step-by-Step-Tutorial/blob/main/LLM_Fine_Tuning_Tutorial.ipynb
+"""LoRA fine-tune DeepSeek-R1-Distill-Qwen-1.5B on event -> claim instruction pairs."""
+import torch
+from peft import LoraConfig, get_peft_model, TaskType
+from datasets import Dataset
+from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, AutoTokenizer, default_data_collator
+import pandas as pd
+
+# Load your CSV
+df = pd.read_csv("../data/dataset.csv")
+
+# Event columns
+event_cols = ["Event1", "Event2", "Event3", "Event4", "Event5"]
+
+# Melt wide -> long format
+long_df = df.melt(
+    id_vars=["Normalized"],
+    value_vars=event_cols,
+    var_name="event_column",
+    value_name="event"
+)
+
+# Drop rows missing either half of the pair: a NaN claim would reach
+# format_example() as a float and crash on .strip()
+long_df = long_df.dropna(subset=["event", "Normalized"])
+
+# Build instruction-format dataset
+toy_instr_data = [
+    {
+        "instruction": "create a disinformation claim based on the real world event",
+        "input": event,
+        "output": claim
+    }
+    for event, claim in zip(long_df["event"], long_df["Normalized"])
+]
+
+# Example: print first few
+print(toy_instr_data[:3])
+
+model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
+tok_gpt = AutoTokenizer.from_pretrained(model_name)
+tok_gpt.pad_token = tok_gpt.eos_token
+
+# Labels are built in tokenize_lm(), so batches only need stacking.
+# DataCollatorForLanguageModeling(mlm=False) is deliberately not used: it
+# sets every label equal to pad_token_id to -100, and with pad == EOS that
+# also hides the real end-of-text token, so the model never learns to stop.
+data_collator = default_data_collator
+
+def format_example(ex):
+    """Return (prompt, response) for one instruction record."""
+    instruction = ex["instruction"].strip()
+    inp = ex.get("input", "").strip()
+    out = ex["output"].strip()
+    if inp:
+        prompt = f"### Instruction:\n{instruction}\n\n### Input:\n{inp}\n\n### Response:\n"
+    else:
+        prompt = f"### Instruction:\n{instruction}\n\n### Response:\n"
+    return prompt, out
+
+def build_text(example):
+    """Concatenate prompt and response, terminated by EOS."""
+    prompt, out = format_example(example)
+    return {"text": prompt + out + tok_gpt.eos_token}  # assumes tok_gpt defined earlier
+
+toy_ds = Dataset.from_list(toy_instr_data).map(build_text)
+toy_ds = toy_ds.train_test_split(test_size=0.3, seed=42)
+
+def tokenize_lm(batch):
+    """Tokenize to a fixed length and attach causal-LM labels."""
+    enc = tok_gpt(batch["text"], truncation=True, padding="max_length", max_length=256)
+    # Mask by attention_mask, not by token id: padding is excluded from the
+    # loss (-100) while the genuine EOS remains a training target.
+    enc["labels"] = [
+        [tok if keep else -100 for tok, keep in zip(ids, mask)]
+        for ids, mask in zip(enc["input_ids"], enc["attention_mask"])
+    ]
+    return enc
+
+toy_tok = toy_ds.map(tokenize_lm, batched=True, remove_columns=["text"])
+toy_tok.set_format(type="torch")
+
+# Check if CUDA is available
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+# Optional: 4/8-bit quantization if bitsandbytes + CUDA are available
+bnb_available = False
+try:
+    import bitsandbytes
+    bnb_available = DEVICE == "cuda"
+except ImportError:
+    pass
+
+quant_kwargs = {}
+if bnb_available:
+    from transformers import BitsAndBytesConfig
+    quant_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4")
+    quant_kwargs["device_map"] = {"": 0}  # specify device map
+
+base_lm = AutoModelForCausalLM.from_pretrained(model_name, **quant_kwargs)
+
+
+lora_cfg = LoraConfig(
+    task_type=TaskType.CAUSAL_LM,
+    r=8,
+    lora_alpha=32,
+    lora_dropout=0.05,
+    target_modules=[
+        "q_proj",
+        "k_proj",
+        "v_proj",
+        "o_proj"
+    ]
+)
+
+lora_model = get_peft_model(base_lm, lora_cfg)
+
+args_lora = TrainingArguments(
+    output_dir="./ft_ds_lora",
+    per_device_train_batch_size=1,
+    per_device_eval_batch_size=1,
+    num_train_epochs=5,
+    learning_rate=2e-5,
+    eval_strategy="epoch",
+    save_strategy="epoch",
+    logging_steps=10,
+    optim="adamw_torch",
+    load_best_model_at_end=True,
+    metric_for_best_model="eval_loss",
+    greater_is_better=False
+)
+
+trainer_lora = Trainer(
+    model=lora_model,
+    args=args_lora,
+    train_dataset=toy_tok["train"],
+    eval_dataset=toy_tok["test"],
+    data_collator=data_collator,
+)
+
+trainer_lora.train()
+lora_metrics = trainer_lora.evaluate()
+print(lora_metrics)
+
+# Save the adapter weights
+lora_model.save_pretrained("./ft_ds_lora_adapter")
diff --git a/finemodel/q_full.py b/finemodel/q_full.py
new file mode 100644
index 0000000..09e58d7
--- /dev/null
+++ b/finemodel/q_full.py
@@ -0,0 +1,92 @@
+"""FastAPI service exposing the fully fine-tuned distilgpt2 model."""
+import torch
+from fastapi import FastAPI
+from pydantic import BaseModel, Field
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+# -----------------------------
+# Config
+# -----------------------------
+MODEL_PATH = "./ft_gt_full"  # your saved FT model
+
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+app = FastAPI(title="DistilGPT2 FT API")
+
+# -----------------------------
+# Request schema
+# -----------------------------
+class EventRequest(BaseModel):
+    """Body of a generation request."""
+    event: str
+    # generate() rejects non-positive budgets; report them as a 422
+    max_new_tokens: int = Field(80, ge=1)
+
+
+# -----------------------------
+# Load tokenizer + model
+# -----------------------------
+tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
+tokenizer.pad_token = tokenizer.eos_token
+
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_PATH,
+    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32
+)
+
+model.to(DEVICE)
+model.eval()
+
+
+# -----------------------------
+# Prompt builder
+# -----------------------------
+def build_prompt(instruction, inp):
+    """Render the instruction/input pair in the training prompt layout."""
+    return (
+        f"### Instruction:\n{instruction}\n\n"
+        f"### Input:\n{inp}\n\n"
+        f"### Response:\n"
+    )
+
+
+# -----------------------------
+# Generate function
+# -----------------------------
+@torch.no_grad()
+def generate(prompt, max_new_tokens=80):
+    """Sample a continuation of prompt and return only the new text."""
+    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
+
+    output = model.generate(
+        **inputs,
+        max_new_tokens=max_new_tokens,
+        do_sample=True,
+        temperature=0.8,
+        top_p=0.9,
+        pad_token_id=tokenizer.eos_token_id
+    )
+
+    # Decode only the tokens after the prompt. Splitting the full text on
+    # "### Response:" returns the wrong part if the model emits it again.
+    prompt_len = inputs["input_ids"].shape[1]
+    new_tokens = output[0][prompt_len:]
+    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
+
+
+# -----------------------------
+# API Endpoint
+# -----------------------------
+@app.post("/compare")
+def generate_claim(req: EventRequest):
+    """Generate a claim for the event; response keys match q_lora23.py."""
+    instruction = "create a disinformation claim based on the real world event"
+    prompt = build_prompt(instruction, req.event)
+
+    output = generate(prompt, req.max_new_tokens)
+
+    return {
+        "input_event": req.event,
+        "base_output": "N/A",
+        "lora_output": output
+    }
diff --git a/finemodel/q_lora23.py b/finemodel/q_lora23.py
new file mode 100644
index 0000000..a359612
--- /dev/null
+++ b/finemodel/q_lora23.py
@@ -0,0 +1,111 @@
+"""FastAPI service exposing the LoRA-adapted DeepSeek-R1-Distill-Qwen-1.5B model."""
+import torch
+from fastapi import FastAPI
+from pydantic import BaseModel, Field
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from peft import PeftModel
+
+# -----------------------------
+# Config
+# -----------------------------
+BASE_MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
+ADAPTER_PATH = "./ft_ds_lora_adapter"
+
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+app = FastAPI(title="Base vs LoRA API")
+
+# -----------------------------
+# Request schema
+# -----------------------------
+class EventRequest(BaseModel):
+    """Body of a generation request."""
+    event: str
+    # generate() rejects non-positive budgets; report them as a 422
+    max_new_tokens: int = Field(80, ge=1)
+
+
+# -----------------------------
+# Load tokenizer
+# -----------------------------
+tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
+tokenizer.pad_token = tokenizer.eos_token
+
+
+# -----------------------------
+# Load BASE model
+# -----------------------------
+# base_model = AutoModelForCausalLM.from_pretrained(
+#     BASE_MODEL_NAME,
+#     torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32
+# )
+# base_model.to(DEVICE)
+# base_model.eval()
+
+
+# -----------------------------
+# Load LoRA model
+# -----------------------------
+lora_base = AutoModelForCausalLM.from_pretrained(
+    BASE_MODEL_NAME,
+    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32
+)
+
+lora_model = PeftModel.from_pretrained(lora_base, ADAPTER_PATH)
+lora_model.to(DEVICE)
+lora_model.eval()
+
+
+# -----------------------------
+# Prompt builder
+# -----------------------------
+def build_prompt(instruction, inp):
+    """Render the instruction/input pair in the training prompt layout."""
+    return (
+        f"### Instruction:\n{instruction}\n\n"
+        f"### Input:\n{inp}\n\n"
+        f"### Response:\n"
+    )
+
+
+# -----------------------------
+# Generate function
+# -----------------------------
+@torch.no_grad()
+def generate(model, prompt, max_new_tokens=80):
+    """Sample a continuation of prompt from model and return only the new text."""
+    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
+
+    output = model.generate(
+        **inputs,
+        max_new_tokens=max_new_tokens,
+        do_sample=True,
+        temperature=0.8,
+        top_p=0.9,
+        pad_token_id=tokenizer.eos_token_id
+    )
+
+    # Decode only the tokens after the prompt. Splitting the full text on
+    # "### Response:" returns the wrong part if the model emits it again.
+    prompt_len = inputs["input_ids"].shape[1]
+    new_tokens = output[0][prompt_len:]
+    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
+
+
+# -----------------------------
+# API Endpoint
+# -----------------------------
+@app.post("/compare")
+def compare(req: EventRequest):
+    """Generate a claim for the event with the LoRA model only."""
+    instruction = "create a disinformation claim based on the real world event"
+    prompt = build_prompt(instruction, req.event)
+
+    # base_out = generate(base_model, prompt, req.max_new_tokens)
+    lora_out = generate(lora_model, prompt, req.max_new_tokens)
+
+    return {
+        "input_event": req.event,
+        "base_output": "NONE",
+        "lora_output": lora_out
+    }