diff --git a/agent/tools/robertaCall.ts b/agent/tools/robertaCall.ts index 571826d..2ba131b 100644 --- a/agent/tools/robertaCall.ts +++ b/agent/tools/robertaCall.ts @@ -10,7 +10,7 @@ export async function evaluateWithRoberta({ }); // console.log(res.data) const validProb = res.data["probabilities"][0][0] - const invalidProb = res.data["probabilities"][0][1] + const invalidProb = res.data["probabilities"][0][1] + res.data["probabilities"][0][2] return {validProb, invalidProb}; } @@ -19,4 +19,7 @@ export async function evaluateWithRoberta({ // console.log(res) // res = await evaluateWithRoberta({answer: "Multiple mirrored reuploads (2020–2023) put the clip on other channels with titles implying it was a genuine 1970s public information film."}); +// console.log(res) + +// res = await evaluateWithRoberta({answer: "The COVID-19 Pandemic"}); // console.log(res) \ No newline at end of file diff --git a/supporting/RAGAS_Service/.gitignore b/supporting/RAGAS_Service/.gitignore index 3f97425..b9df52e 100644 --- a/supporting/RAGAS_Service/.gitignore +++ b/supporting/RAGAS_Service/.gitignore @@ -1,6 +1,7 @@ # -- OURS -- results/ roberta_classifier/ +roberta_classifier*/ # -- THEIRS -- # Byte-compiled / optimized / DLL files
diff --git a/supporting/RAGAS_Service/generate_adversarial.py b/supporting/RAGAS_Service/generate_adversarial.py
new file mode 100644
index 0000000..6e6240c
--- /dev/null
+++ b/supporting/RAGAS_Service/generate_adversarial.py
@@ -0,0 +1,103 @@
+import csv
+import json
+import logging
+from pathlib import Path
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from threading import Lock
+from tqdm import tqdm
+from dotenv import load_dotenv
+from openai import OpenAI
+
+ENV_PATH = Path("../../agent/.env")
+load_dotenv(dotenv_path=ENV_PATH)
+
+client = OpenAI()
+logger = logging.getLogger(__name__)
+
+INPUT_FILE = "../../data/reranked/0_original.jsonl"
+OUTPUT_FILE = "output.txt"
+MODEL = "gpt-5-nano"
+LABEL = "NSPECIFIC"  # label written next to every generated item
+
+MAX_WORKERS = 60  # tune this
+
+write_lock = Lock()
+
+
+def make_request(line):
+    """Ask the model for one story item about the topic in a JSONL record.
+
+    Args:
+        line: One line of the input file, parsed as JSON; its "text" field
+            is used as the topic.
+
+    Returns:
+        The stripped, single-line model answer, or "" when the record has no
+        usable topic, the request fails, or the answer is filtered out.
+    """
+    try:
+        topic = json.loads(line).get("text", "")
+    except (ValueError, AttributeError) as exc:
+        logger.warning("Skipping unparsable input line: %s", exc)
+        return ""
+
+    # The prompt is never empty, so the emptiness check belongs on the topic.
+    if not isinstance(topic, str) or not topic.strip():
+        return ""
+
+    prompt = (
+        "Provide a story item for the spread of a disinformation claim "
+        "that is related to the topic: "
+        + topic
+        + " Include just the event no other text."
+        + " A good example would be 'No immediate U.S. government confirmation and near‑simultaneous fact‑checks/debunks appeared (fact‑checks published June 26, 2024).' and 'Recycled/old footage of aircraft being shot down previously viral and repeatedly misattributed to the Russia–Ukraine war (e.g., 2011 Libya footage reused in 2022)'"
+        + " If you cannot answer just return an empty string"
+        + " Be concise, make no mistakes"
+    )
+
+    try:
+        response = client.responses.create(model=MODEL, input=prompt)
+        text = response.output_text.strip() if response.output_text else ""
+    except Exception as exc:
+        # Best effort: one failed request must not abort the whole batch.
+        logger.warning("Request failed: %s", exc)
+        return ""
+
+    # Reject multi-line answers and ones containing "sorry" or "you".
+    lowered = text.lower()
+    if text and "\n" not in text and "sorry" not in lowered and "you" not in lowered:
+        return text
+    return ""
+
+
+def process_file(input_path, output_path):
+    """Generate a story item for every input line and save them as CSV rows.
+
+    Requests run concurrently on up to MAX_WORKERS threads. Each non-empty
+    answer becomes one ``answer,NSPECIFIC`` row, written in completion order.
+
+    Args:
+        input_path: JSONL file with one record per line.
+        output_path: File to (over)write with the generated rows.
+    """
+    with open(input_path, "r", encoding="utf-8") as infile:
+        lines = list(infile)
+
+    # newline="" leaves line endings to the csv writer.
+    with open(output_path, "w", encoding="utf-8", newline="") as outfile:
+        # csv quoting keeps answers that contain commas or quotes in one field;
+        # plain string concatenation produced rows with extra columns.
+        writer = csv.writer(outfile, lineterminator="\n")
+        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+            futures = [executor.submit(make_request, line) for line in lines]
+
+            for future in tqdm(as_completed(futures), total=len(futures), desc="Processing"):
+                result = future.result()
+                if result:
+                    # Rows are written from this thread only; the lock is a cheap guard.
+                    with write_lock:
+                        writer.writerow([result, LABEL])
+
+
+if __name__ == "__main__":
+    process_file(INPUT_FILE, OUTPUT_FILE)
\ No newline at end of file
diff --git a/supporting/RAGAS_Service/generate_adversarial2.py b/supporting/RAGAS_Service/generate_adversarial2.py
new file mode 100644
index 0000000..abb0958
--- /dev/null
+++ b/supporting/RAGAS_Service/generate_adversarial2.py
@@ -0,0 +1,61 @@
+import csv
+from transformers import MarianMTModel, MarianTokenizer
+from tqdm import tqdm
+
+input_csv = "../../data/classify.csv"
+output_csv = "output.csv"
+labels_to_augment = ["STORY", "NSPECIFIC"]
+intermediate_lang = "fr"
+num_return_sequences = 1
+
+# English to Intermediate language
+model_name_src = f"Helsinki-NLP/opus-mt-en-{intermediate_lang}"
+tokenizer_src = MarianTokenizer.from_pretrained(model_name_src)
+model_src = MarianMTModel.from_pretrained(model_name_src)
+
+# Intermediate language to English
+model_name_back = f"Helsinki-NLP/opus-mt-{intermediate_lang}-en"
+tokenizer_back = MarianTokenizer.from_pretrained(model_name_back)
+model_back = MarianMTModel.from_pretrained(model_name_back)
+
+def back_translate(text):
+    """Return English paraphrases of `text` via an en -> intermediate -> en round trip."""
+    # truncation=True keeps over-long events within the tokenizer's maximum input length.
+    batch = tokenizer_src([text], return_tensors="pt", padding=True, truncation=True)
+    translated = model_src.generate(**batch, max_length=256)
+    intermediate_text = tokenizer_src.decode(translated[0], skip_special_tokens=True)
+
+    batch_back = tokenizer_back([intermediate_text], return_tensors="pt", padding=True, truncation=True)
+    back_translated = model_back.generate(**batch_back, max_length=256, num_beams=5, num_return_sequences=num_return_sequences)
+    return [tokenizer_back.decode(t, skip_special_tokens=True) for t in back_translated]
+
+
+augmented_rows = []
+
+with open(input_csv, newline="", encoding="utf-8") as f:
+    reader = csv.DictReader(f)
+    for row in tqdm(reader, desc="Processing CSV"):
+        event = row["event"]
+        label = row["extra_info"]
+
+        # Keep original row
+        augmented_rows.append({"event": event, "label": label})
+
+        # Only augment certain labels
+        if label in labels_to_augment:
+            try:
+                new_texts = back_translate(event)
+            except Exception as e:
+                print(f"Error back-translating row: {event}")
+                print(e)
+                continue
+            # An unchanged round trip would only add an exact duplicate of the original row.
+            augmented_rows.extend({"event": t, "label": label} for t in new_texts if t.strip() != event.strip())
+
+with open(output_csv, "w", newline="", encoding="utf-8") as f:
+    writer = csv.DictWriter(f, fieldnames=["event", "label"])
+    writer.writeheader()
+    writer.writerows(augmented_rows)
+
+print(f"Saved augmented dataset to {output_csv}") +print(f"Original size: {len(augmented_rows)} rows (includes originals + augmented)") \ No newline at end of file diff --git a/supporting/RAGAS_Service/temp.py b/supporting/RAGAS_Service/temp.py deleted file mode 100644 index b75b788..0000000 --- a/supporting/RAGAS_Service/temp.py +++ /dev/null @@ -1,45 +0,0 @@ -import pandas as pd -import numpy as np -from sentence_transformers import SentenceTransformer -from sklearn.metrics.pairwise import cosine_similarity - -# CONFIG -CSV_PATH = "../../data/classify.csv" -EVENT_COLUMN = "event" -TOP_K = 60 - -# Load CSV -df = pd.read_csv(CSV_PATH) - -events = df[EVENT_COLUMN].astype(str).tolist() - -# Load embedding model -model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2") - -print("Embedding events...") -embeddings = model.encode(events, batch_size=32, show_progress_bar=True) - -# Compute cosine similarity matrix -sim_matrix = cosine_similarity(embeddings) - -# Collect pair similarities -pairs = [] - -n = len(events) -for i in range(n): - for j in range(i + 1, n): # avoid duplicates and self comparisons - pairs.append((sim_matrix[i][j], i, j)) - -# Sort by similarity descending -pairs.sort(reverse=True, key=lambda x: x[0]) - -# Top K pairs -top_pairs = pairs[:TOP_K] - -print("\nTop Similar Event Pairs:\n") - -for score, i, j in top_pairs: - print(f"Similarity: {score:.4f}") - print(f"Event 1: {events[i]}") - print(f"Event 2: {events[j]}") - print("-" * 60) \ No newline at end of file diff --git a/supporting/RAGAS_Service/train_roberta.py b/supporting/RAGAS_Service/train_roberta.py index 58299a0..a1c5145 100644 --- a/supporting/RAGAS_Service/train_roberta.py +++ b/supporting/RAGAS_Service/train_roberta.py @@ -1,3 +1,5 @@ +from sklearn.utils import compute_class_weight +from torch.nn import CrossEntropyLoss from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments import torch from sklearn.model_selection import 
train_test_split @@ -5,20 +7,36 @@ from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_sc from collections import Counter import sys import csv +import numpy as np -NUM_CLASSES = 2 +NUM_CLASSES = 3 model_name = "roberta-base" LABEL_PRIORITY = [ ("PERFECT", 0), ("STORY", 1), - ("NSPECIFIC", 1), + ("NSPECIFIC", 2), ("REWORDING", 1), ("TINCORRECT", -1), ("DUPLICATE", -1), ("", 0), # fallback to PERFECT ]
+class WeightedTrainer(Trainer):
+    """Trainer whose loss is cross-entropy weighted by `class_weights` (None = unweighted)."""
+
+    def __init__(self, *args, class_weights=None, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.class_weights = class_weights  # per-class weight tensor, or None
+
+    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
+        """Return the loss, or ``(loss, outputs)`` when `return_outputs` is true."""
+        outputs = model(**inputs)
+        logits = outputs.get("logits")
+        # Guard the None default: calling `.to` on it raised AttributeError on the first batch.
+        weight = None if self.class_weights is None else self.class_weights.to(logits.device)
+        loss = CrossEntropyLoss(weight=weight)(logits, inputs.get("labels"))
+        return (loss, outputs) if return_outputs else loss
+
 def label_to_int(extra_info: str) -> int: """ Convert extra_info string to integer label using priority rules.
@@ -90,9 +108,9 @@ def compute_metrics(eval_pred): return { "accuracy": accuracy_score(labels, preds), - "f1": f1_score(labels, preds, average="weighted"), - "precision": precision_score(labels, preds, average="weighted"), - "recall": recall_score(labels, preds, average="weighted"), + "f1": f1_score(labels, preds, average="weighted", zero_division=0), + "precision": precision_score(labels, preds, average="weighted", zero_division=0), + "recall": recall_score(labels, preds, average="weighted", zero_division=0), } texts, labels = load_dataset_from_csv("../../data/classify.csv") @@ -106,7 +124,7 @@ model = RobertaForSequenceClassification.from_pretrained( for param in model.roberta.parameters(): param.requires_grad = False -for param in model.roberta.encoder.layer[-2:].parameters(): +for param in model.roberta.encoder.layer[-3:].parameters(): param.requires_grad = True print("Dataset size:", len(texts)) @@ -120,6 +138,16 @@ train_texts, val_texts, train_labels, val_labels = train_test_split( random_state=42 ) + +class_weights = compute_class_weight( + class_weight="balanced", + classes=np.unique(train_labels), + y=train_labels +) + +class_weights = torch.tensor(class_weights, dtype=torch.float) +print("Class weights:", class_weights) + train_encodings = tokenizer( train_texts, truncation=True, @@ -160,19 +188,21 @@ training_args = TrainingArguments( eval_strategy="epoch", save_strategy="epoch", metric_for_best_model="f1", - greater_is_better=True + greater_is_better=True, + dataloader_pin_memory=False ) train_dataset = TextDataset(train_encodings, train_labels) val_dataset = TextDataset(val_encodings, val_labels) -trainer = Trainer( +trainer = WeightedTrainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=val_dataset, - compute_metrics=compute_metrics + compute_metrics=compute_metrics, + class_weights=class_weights ) trainer.train() diff --git a/supporting/scorer/views/stats.py b/supporting/scorer/views/stats.py index bdc4ff0..a2a2585 
100644 --- a/supporting/scorer/views/stats.py +++ b/supporting/scorer/views/stats.py @@ -63,6 +63,7 @@ def render(): st.subheader(f"File: {file_path.name}") confidence_counter = Counter() + wrong_counter = Counter() overconfident_docs = [] underconfident_docs = [] dup_counter = 0 @@ -90,6 +91,7 @@ def render(): confidence_counter["Correct-FINE"] += 1 elif score > THRESH and extra_lower != "perfect" and extra_lower != "": confidence_counter["Over-confident"] += 1 + wrong_counter[extra_lower] += 1 overconfident_docs.append(doc_id) elif score < THRESH and (extra_lower == "perfect" or extra_lower == ""): confidence_counter["Under-confident"] += 1 @@ -134,5 +136,12 @@ def render(): st.container(height=200).write(sorted(set(underconfident_docs))) else: st.info("None") + + df_words = ( + pd.DataFrame(wrong_counter.items(), columns=["Label", "Count"]) + .sort_values("Count", ascending=False) + ) + + st.dataframe(df_words) else: st.info("No score data available in this file.") \ No newline at end of file