From f09e36e74028894d27fa58bf6b24aada20b93827 Mon Sep 17 00:00:00 2001 From: William Jeynes Date: Wed, 11 Mar 2026 22:02:31 +0000 Subject: [PATCH] Add initial version of ROBERTA classifier, add ability for multi pie charts --- supporting/RAGAS_Service/.gitignore | 5 + supporting/RAGAS_Service/prepare_data.py | 22 +++ supporting/RAGAS_Service/requirements.txt | 4 + supporting/RAGAS_Service/roberta_service.py | 25 +++ supporting/RAGAS_Service/train_roberta.py | 186 ++++++++++++++++++++ supporting/scorer/views/stats.py | 90 ++++++---- 6 files changed, 299 insertions(+), 33 deletions(-) create mode 100644 supporting/RAGAS_Service/prepare_data.py create mode 100644 supporting/RAGAS_Service/roberta_service.py create mode 100644 supporting/RAGAS_Service/train_roberta.py diff --git a/supporting/RAGAS_Service/.gitignore b/supporting/RAGAS_Service/.gitignore index 64d49ae..3f97425 100644 --- a/supporting/RAGAS_Service/.gitignore +++ b/supporting/RAGAS_Service/.gitignore @@ -1,3 +1,8 @@ +# -- OURS -- +results/ +roberta_classifier/ + +# -- THEIRS -- # Byte-compiled / optimized / DLL files __pycache__/ *.py[codz] diff --git a/supporting/RAGAS_Service/prepare_data.py b/supporting/RAGAS_Service/prepare_data.py new file mode 100644 index 0000000..e75d278 --- /dev/null +++ b/supporting/RAGAS_Service/prepare_data.py @@ -0,0 +1,22 @@ +import json +import csv + +input_file = "../../data/input.jsonl" +output_file = "../../data/classify.csv" + +with open(input_file, "r", encoding="utf-8") as infile, \ + open(output_file, "w", newline="", encoding="utf-8") as outfile: + + writer = csv.writer(outfile) + writer.writerow(["event", "extra_info"]) # header + + for line in infile: + data = json.loads(line) + + events = data.get("events", []) + for event in events: + event_text = event.get("event", "") + extra_info = event.get("extra_info", "").strip() + writer.writerow([event_text, extra_info]) + +print(f"Saved CSV to {output_file}") \ No newline at end of file diff --git 
a/supporting/RAGAS_Service/requirements.txt b/supporting/RAGAS_Service/requirements.txt index 3c87cd7..be1aa73 100644 --- a/supporting/RAGAS_Service/requirements.txt +++ b/supporting/RAGAS_Service/requirements.txt @@ -6,6 +6,10 @@ uvicorn[standard] ragas datasets +# ROBERTA +scikit-learn +transformers[torch] + # Utils numpy pandas diff --git a/supporting/RAGAS_Service/roberta_service.py b/supporting/RAGAS_Service/roberta_service.py new file mode 100644 index 0000000..a01f848 --- /dev/null +++ b/supporting/RAGAS_Service/roberta_service.py @@ -0,0 +1,25 @@ +from transformers import RobertaTokenizer, RobertaForSequenceClassification +import torch + +MODEL_PATH = "./roberta_classifier" + +tokenizer = RobertaTokenizer.from_pretrained(MODEL_PATH) +model = RobertaForSequenceClassification.from_pretrained(MODEL_PATH) + +text2 = "High-profile political downplaying of COVID-19 (examples: President Trump saying 'it will go away' in March–August 2020)" +text = "Multiple mirrored reuploads (2020–2023) put the clip on other channels with titles implying it was a genuine 1970s public information film." 
+ +inputs = tokenizer( + text, + return_tensors="pt", + truncation=True, + padding=True +) + +model.eval() + +with torch.no_grad(): + logits = model(**inputs).logits + +probs = torch.softmax(logits, dim=1) +print(probs) \ No newline at end of file diff --git a/supporting/RAGAS_Service/train_roberta.py b/supporting/RAGAS_Service/train_roberta.py new file mode 100644 index 0000000..b4abf95 --- /dev/null +++ b/supporting/RAGAS_Service/train_roberta.py @@ -0,0 +1,186 @@ +from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments +import torch +from sklearn.model_selection import train_test_split +from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score +from collections import Counter +import sys +import csv + +NUM_CLASSES = 3 +model_name = "roberta-base" + +LABEL_PRIORITY = [ + ("PERFECT", 0), + ("STORY", 1), + ("NSPECIFIC", 2), + ("REWORDING", 2), + ("TINCORRECT", -1), + ("DUPLICATE", -1), + ("", 2), # empty extra_info maps to class 2 (same as NSPECIFIC/REWORDING), not PERFECT (0) - confirm intended +] + +def label_to_int(extra_info: str) -> int: + """ + Convert extra_info string to integer label using priority rules. 
+ """ + + if extra_info is None: + extra_info = "" + + extra_info = extra_info.strip() + + # Handle empty string explicitly + if extra_info == "": + for key, value in LABEL_PRIORITY: + if key == "": + return value + raise ValueError("Empty extra_info but no empty mapping defined") + + # Split words (case-insensitive) + tokens = set(extra_info.upper().split()) + + # Priority matching + for key, value in LABEL_PRIORITY: + if key == "": + continue + + if key in tokens: + return value + + raise ValueError(f"Unknown label content: '{extra_info}'") + + +def load_dataset_from_csv(path): + texts = [] + labels = [] + + removed_rows = 0 + + with open(path, newline="", encoding="utf-8") as f: + reader = csv.DictReader(f) + + for i, row in enumerate(reader, start=1): + text = row["event"] + label_str = row["extra_info"] + + try: + label_int = label_to_int(label_str) + except Exception as e: + print(f"ERROR converting label on line {i}: {label_str}") + print(e) + sys.exit(1) + + # Skip rows marked for removal + if label_int == -1: + removed_rows += 1 + continue + + texts.append(text) + labels.append(label_int) + + print(f"Loaded {len(texts)} samples (removed {removed_rows})") + + return texts, labels + + + +def compute_metrics(eval_pred): + logits, labels = eval_pred + preds = logits.argmax(axis=1) + + return { + "accuracy": accuracy_score(labels, preds), + "f1": f1_score(labels, preds, average="weighted"), + "precision": precision_score(labels, preds, average="weighted"), + "recall": recall_score(labels, preds, average="weighted"), + } + +texts, labels = load_dataset_from_csv("../../data/classify.csv") + +tokenizer = RobertaTokenizer.from_pretrained(model_name) +model = RobertaForSequenceClassification.from_pretrained( + model_name, + num_labels=NUM_CLASSES +) + +for param in model.roberta.parameters(): + param.requires_grad = False + +for param in model.roberta.encoder.layer[-2:].parameters(): + param.requires_grad = True + +print("Dataset size:", len(texts)) +print("Label 
distribution:") +print(Counter(labels)) + +train_texts, val_texts, train_labels, val_labels = train_test_split( + texts, + labels, + test_size=0.2, + random_state=42 +) + +train_encodings = tokenizer( + train_texts, + truncation=True, + padding=True, + max_length=256 +) + +val_encodings = tokenizer( + val_texts, + truncation=True, + padding=True, + max_length=256 +) + +class TextDataset(torch.utils.data.Dataset): + def __init__(self, encodings, labels): + self.encodings = encodings + self.labels = labels + + def __getitem__(self, idx): + item = { + key: torch.tensor(val[idx]) + for key, val in self.encodings.items() + } + item["labels"] = torch.tensor(self.labels[idx]) + return item + + def __len__(self): + return len(self.labels) + +training_args = TrainingArguments( + output_dir="./results", + learning_rate=1e-5, + per_device_train_batch_size=8, + num_train_epochs=15, + weight_decay=0.01, + load_best_model_at_end=True, + eval_strategy="epoch", + save_strategy="epoch", + metric_for_best_model="f1", + greater_is_better=True +) + +train_dataset = TextDataset(train_encodings, train_labels) + +val_dataset = TextDataset(val_encodings, val_labels) + +trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset, + eval_dataset=val_dataset, + compute_metrics=compute_metrics +) + +trainer.train() + +metrics = trainer.evaluate() +print("Final evaluation metrics:") +for k, v in metrics.items(): + print(f"{k}: {v}") + +trainer.save_model("./roberta_classifier") +tokenizer.save_pretrained("./roberta_classifier") \ No newline at end of file diff --git a/supporting/scorer/views/stats.py b/supporting/scorer/views/stats.py index a9499af..70b2436 100644 --- a/supporting/scorer/views/stats.py +++ b/supporting/scorer/views/stats.py @@ -1,4 +1,6 @@ from collections import Counter +from pathlib import Path +import json import streamlit as st import pandas as pd import matplotlib.pyplot as plt @@ -27,22 +29,6 @@ def render(): words = extra.strip().split() 
word_counter.update(words) - # ---- confidence classification ---- - if score is not None: - extra_lower = extra.strip().lower() - - if score > THRESH and extra_lower == "perfect": - confidence_counter["Correct"] += 1 - - elif score > THRESH and extra_lower != "perfect": - confidence_counter["Over-confident"] += 1 - - elif score < THRESH and extra_lower == "perfect": - confidence_counter["Under-confident"] += 1 - - else: - confidence_counter["Other"] += 1 - # -------------------------- # Extra Info Word Counts # -------------------------- @@ -62,24 +48,62 @@ def render(): # -------------------------- # Confidence vs Label Stats # -------------------------- - st.subheader("Confidence vs Label Distribution") + st.header("Confidence vs Label Distribution per JSONL File") - if confidence_counter: - df_conf = pd.DataFrame( - confidence_counter.items(), - columns=["Category", "Count"] - ) + path = Path("../../data/reranked") - fig, ax = plt.subplots() - ax.pie( - df_conf["Count"], - labels=df_conf["Category"], - autopct="%1.1f%%", - startangle=90 - ) - ax.axis("equal") + if not path.exists() or not path.is_dir(): + st.error("Invalid folder path.") + return - st.pyplot(fig, width=500) + jsonl_files = list(path.glob("*.jsonl")) + if not jsonl_files: + st.info("No .jsonl files found in this folder.") + return - else: - st.info("No score data available yet.") \ No newline at end of file + for file_path in jsonl_files: + st.subheader(f"File: {file_path.name}") + + confidence_counter = Counter() + + # ---- Read file line by line ---- + with open(file_path, "r", encoding="utf-8") as f: + for line in f: + try: + entry = json.loads(line) + except json.JSONDecodeError: + continue + + for event in entry.get("events", []): + score = event.get("score", None) + extra_lower = (event.get("extra_info", "") or "").strip().lower() + print(extra_lower) + if score is not None: + if score > THRESH and extra_lower == "perfect": + confidence_counter["Correct"] += 1 + elif score > THRESH and 
extra_lower != "perfect": + confidence_counter["Over-confident"] += 1 + elif score < THRESH and extra_lower == "perfect": + confidence_counter["Under-confident"] += 1 + else: + confidence_counter["Other"] += 1 + + if confidence_counter: + df_conf = pd.DataFrame( + confidence_counter.items(), + columns=["Category", "Count"] + ) + + fig, ax = plt.subplots() + ax.pie( + df_conf["Count"], + labels=df_conf["Category"], + autopct="%1.1f%%", + startangle=90 + ) + ax.axis("equal") + ax.set_title(file_path.name) + + st.pyplot(fig, width=500) + else: + st.info("No score data available in this file.") \ No newline at end of file