Add initial version of RoBERTa classifier, add ability for multiple pie charts

This commit is contained in:
William Jeynes
2026-03-11 22:02:31 +00:00
parent ef6330ec07
commit f09e36e740
6 changed files with 299 additions and 33 deletions
+5
View File
@@ -1,3 +1,8 @@
# -- OURS --
results/
roberta_classifier/
# -- THEIRS --
# Byte-compiled / optimized / DLL files # Byte-compiled / optimized / DLL files
__pycache__/ __pycache__/
*.py[codz] *.py[codz]
+22
View File
@@ -0,0 +1,22 @@
import json
import csv
input_file = "../../data/input.jsonl"
output_file = "../../data/classify.csv"


def iter_event_rows(lines):
    """Yield ``[event, extra_info]`` rows from an iterable of JSONL lines.

    Each non-blank line is parsed as a JSON object; every entry of its
    ``"events"`` list produces one row. A missing or ``null`` ``extra_info``
    becomes an empty string, otherwise surrounding whitespace is stripped.

    Args:
        lines: Iterable of strings, one JSON document per item.

    Yields:
        Two-element lists ``[event_text, extra_info]``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    for line in lines:
        # Blank lines (e.g. a trailing newline at EOF) are not JSON; skip them
        # instead of letting json.loads fail.
        if not line.strip():
            continue
        data = json.loads(line)
        # "events" may be absent or explicitly null.
        for event in data.get("events") or []:
            event_text = event.get("event", "")
            # The key can be present with a null value, so .get()'s default
            # is not enough to guarantee a string here.
            extra_info = (event.get("extra_info") or "").strip()
            yield [event_text, extra_info]


def main():
    """Convert ``input_file`` (JSONL) into ``output_file`` (CSV with header)."""
    with open(input_file, "r", encoding="utf-8") as infile, \
            open(output_file, "w", newline="", encoding="utf-8") as outfile:
        writer = csv.writer(outfile)
        writer.writerow(["event", "extra_info"])  # header
        writer.writerows(iter_event_rows(infile))
    print(f"Saved CSV to {output_file}")


if __name__ == "__main__":
    main()
@@ -6,6 +6,10 @@ uvicorn[standard]
ragas ragas
datasets datasets
# ROBERTA
scikit-learn
transformers[torch]
# Utils # Utils
numpy numpy
pandas pandas
@@ -0,0 +1,25 @@
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import torch
# Directory holding the fine-tuned tokenizer and classifier weights.
MODEL_PATH = "./roberta_classifier"

tokenizer = RobertaTokenizer.from_pretrained(MODEL_PATH)
model = RobertaForSequenceClassification.from_pretrained(MODEL_PATH)
# Switch the model to evaluation mode before running inference.
model.eval()

# Sample inputs; only `text` is classified below, `text2` is kept as an
# alternative example.
text2 = "High-profile political downplaying of COVID-19 (examples: President Trump saying 'it will go away' in MarchAugust 2020)"
text = "Multiple mirrored reuploads (20202023) put the clip on other channels with titles implying it was a genuine 1970s public information film."

inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

# No gradients are needed for a forward-only pass.
with torch.no_grad():
    probs = torch.softmax(model(**inputs).logits, dim=1)

# One row of per-class probabilities for the single input text.
print(probs)
+186
View File
@@ -0,0 +1,186 @@
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from collections import Counter
import sys
import csv
# Number of output classes requested from the classification head; the
# non-negative class ids used in LABEL_PRIORITY below are 0, 1 and 2.
NUM_CLASSES = 3
# Identifier of the pretrained checkpoint passed to from_pretrained().
model_name = "roberta-base"
# Ordered (keyword, class id) pairs. label_to_int() walks this list top to
# bottom, so earlier keywords win when several appear in one extra_info
# string. A class id of -1 marks rows that the CSV loader drops instead of
# training on.
LABEL_PRIORITY = [
    ("PERFECT", 0),
    ("STORY", 1),
    ("NSPECIFIC", 2),
    ("REWORDING", 2),
    ("TINCORRECT", -1),
    ("DUPLICATE", -1),
    # Empty extra_info maps to class 2.
    # NOTE(review): this entry was previously commented "fallback to PERFECT",
    # but PERFECT is class 0 while this value is 2 -- confirm which is intended.
    ("", 2),
]
def label_to_int(extra_info: str) -> int:
    """Map an ``extra_info`` annotation string to its integer class id.

    ``None`` is treated as an empty string and surrounding whitespace is
    ignored. An empty annotation resolves to the value registered under the
    ``""`` key of ``LABEL_PRIORITY``. Otherwise the annotation is upper-cased,
    split on whitespace, and the first ``LABEL_PRIORITY`` keyword found among
    the resulting words decides the class.

    Raises:
        ValueError: If the annotation is empty and no ``""`` entry exists, or
            if none of the known keywords occurs in the annotation.
    """
    extra_info = ("" if extra_info is None else extra_info).strip()

    if not extra_info:
        # Look up the dedicated empty-string entry, if any.
        empty_values = [value for key, value in LABEL_PRIORITY if key == ""]
        if empty_values:
            return empty_values[0]
        raise ValueError("Empty extra_info but no empty mapping defined")

    # Case-insensitive, whole-word matching.
    words = set(extra_info.upper().split())

    # List order encodes priority: the first keyword present wins.
    for key, value in LABEL_PRIORITY:
        if key != "" and key in words:
            return value

    raise ValueError(f"Unknown label content: '{extra_info}'")
def load_dataset_from_csv(path):
    """Load training texts and integer labels from a CSV file.

    The file is read with ``csv.DictReader`` and must provide ``event`` and
    ``extra_info`` columns. Each ``extra_info`` value is converted with
    ``label_to_int``; rows whose label converts to ``-1`` are skipped and
    counted as removed.

    Args:
        path: Path of the CSV file to read (UTF-8).

    Returns:
        A ``(texts, labels)`` tuple of two parallel lists.

    Raises:
        SystemExit: With status 1 when a label cannot be converted. The
            offending file line and the conversion error are written to
            stderr first.
    """
    texts = []
    labels = []
    removed_rows = 0

    with open(path, newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            text = row["event"]
            label_str = row["extra_info"]

            try:
                label_int = label_to_int(label_str)
            except ValueError as e:
                # reader.line_num is the physical line just consumed, so it
                # accounts for the header row and for quoted fields spanning
                # several lines (a record counter would be off).
                print(
                    f"ERROR converting label on line {reader.line_num}: {label_str}",
                    file=sys.stderr,
                )
                print(e, file=sys.stderr)
                sys.exit(1)

            # Skip rows marked for removal
            if label_int == -1:
                removed_rows += 1
                continue

            texts.append(text)
            labels.append(label_int)

    print(f"Loaded {len(texts)} samples (removed {removed_rows})")
    return texts, labels
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1/precision/recall for an evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class of each
            sample is the arg-max of its logits along axis 1.

    Returns:
        Dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    logits, labels = eval_pred
    predictions = logits.argmax(axis=1)

    results = {"accuracy": accuracy_score(labels, predictions)}
    # The remaining scores all use support-weighted averaging across classes.
    weighted_scorers = (
        ("f1", f1_score),
        ("precision", precision_score),
        ("recall", recall_score),
    )
    for metric_name, scorer in weighted_scorers:
        results[metric_name] = scorer(labels, predictions, average="weighted")
    return results
# ---- Data and pretrained model ----
texts, labels = load_dataset_from_csv("../../data/classify.csv")

tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=NUM_CLASSES)

# Freeze the whole RoBERTa backbone, then re-enable gradients for its last two
# encoder layers only. Order matters: the second call overrides the first for
# those layers.
model.roberta.requires_grad_(False)
model.roberta.encoder.layer[-2:].requires_grad_(True)

print("Dataset size:", len(texts))
print("Label distribution:")
print(Counter(labels))

# ---- Train/validation split (80/20, fixed seed) ----
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# ---- Tokenization: identical settings for both splits ----
tokenize_kwargs = {"truncation": True, "padding": True, "max_length": 256}
train_encodings = tokenizer(train_texts, **tokenize_kwargs)
val_encodings = tokenizer(val_texts, **tokenize_kwargs)
class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    ``encodings`` is a mapping from field name to a per-sample sequence, and
    ``labels`` is a sequence with one entry per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is defined by the label list.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one sample: every encoding field at position idx as a tensor,
        # plus the matching label under the "labels" key.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample
# ---- Datasets ----
train_dataset = TextDataset(train_encodings, train_labels)
val_dataset = TextDataset(val_encodings, val_labels)

# ---- Trainer configuration ----
# Evaluation and checkpointing both happen once per epoch, and "f1" (higher is
# better) is the metric used to pick the best model to load at the end.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=15,
    per_device_train_batch_size=8,
    learning_rate=1e-5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# ---- Train, then report validation metrics ----
trainer.train()

metrics = trainer.evaluate()
print("Final evaluation metrics:")
for k, v in metrics.items():
    print(f"{k}: {v}")

# ---- Persist model and tokenizer side by side ----
trainer.save_model("./roberta_classifier")
tokenizer.save_pretrained("./roberta_classifier")
+43 -19
View File
@@ -1,4 +1,6 @@
from collections import Counter from collections import Counter
from pathlib import Path
import json
import streamlit as st import streamlit as st
import pandas as pd import pandas as pd
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
@@ -27,22 +29,6 @@ def render():
words = extra.strip().split() words = extra.strip().split()
word_counter.update(words) word_counter.update(words)
# ---- confidence classification ----
if score is not None:
extra_lower = extra.strip().lower()
if score > THRESH and extra_lower == "perfect":
confidence_counter["Correct"] += 1
elif score > THRESH and extra_lower != "perfect":
confidence_counter["Over-confident"] += 1
elif score < THRESH and extra_lower == "perfect":
confidence_counter["Under-confident"] += 1
else:
confidence_counter["Other"] += 1
# -------------------------- # --------------------------
# Extra Info Word Counts # Extra Info Word Counts
# -------------------------- # --------------------------
@@ -62,7 +48,45 @@ def render():
# -------------------------- # --------------------------
# Confidence vs Label Stats # Confidence vs Label Stats
# -------------------------- # --------------------------
st.subheader("Confidence vs Label Distribution") st.header("Confidence vs Label Distribution per JSONL File")
path = Path("../../data/reranked")
if not path.exists() or not path.is_dir():
st.error("Invalid folder path.")
return
jsonl_files = list(path.glob("*.jsonl"))
if not jsonl_files:
st.info("No .jsonl files found in this folder.")
return
for file_path in jsonl_files:
st.subheader(f"File: {file_path.name}")
confidence_counter = Counter()
# ---- Read file line by line ----
with open(file_path, "r", encoding="utf-8") as f:
for line in f:
try:
entry = json.loads(line)
except json.JSONDecodeError:
continue
for event in entry.get("events", []):
score = event.get("score", None)
extra_lower = (event.get("extra_info", "") or "").strip().lower()
print(extra_lower)
if score is not None:
if score > THRESH and extra_lower == "perfect":
confidence_counter["Correct"] += 1
elif score > THRESH and extra_lower != "perfect":
confidence_counter["Over-confident"] += 1
elif score < THRESH and extra_lower == "perfect":
confidence_counter["Under-confident"] += 1
else:
confidence_counter["Other"] += 1
if confidence_counter: if confidence_counter:
df_conf = pd.DataFrame( df_conf = pd.DataFrame(
@@ -78,8 +102,8 @@ def render():
startangle=90 startangle=90
) )
ax.axis("equal") ax.axis("equal")
ax.set_title(file_path.name)
st.pyplot(fig, width=500) st.pyplot(fig, width=500)
else: else:
st.info("No score data available yet.") st.info("No score data available in this file.")