"""Preliminary analysis of scorer results stored as JSON Lines.

Reads ``DATA_FILE`` (one JSON object per line), collects every scored event
from records whose ``status`` is ``"success"``, and prints:

* the average model score,
* the average absolute gap between ``human_score`` and ``score``,
* the single event with the largest gap, together with its claim.

Events that lack a numeric ``score`` or ``human_score`` cannot contribute to
a gap, so they are skipped and counted instead of aborting the run.
"""
import json
import sys
from statistics import mean

# ------------------------------------------------------------
# Configuration
# ------------------------------------------------------------
# Relative path: resolved against the current working directory.
DATA_FILE = "../Wrapper/results.jsonl"


def _is_number(value) -> bool:
    """Return True for int/float values (bool is deliberately excluded)."""
    return isinstance(value, (int, float)) and not isinstance(value, bool)


# ------------------------------------------------------------
# Load JSONL file
# ------------------------------------------------------------
def load_records(path: str) -> list:
    """Parse a JSON Lines file into a list of decoded objects.

    Blank lines are ignored.

    Raises:
        ValueError: if a non-blank line is not valid JSON; the message names
            the file and the 1-based line number.
        OSError: if the file cannot be opened.
    """
    records = []
    with open(path, "r", encoding="utf-8") as f:
        for lineno, raw in enumerate(f, start=1):
            line = raw.strip()
            if not line:
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as err:
                # JSONDecodeError is itself a ValueError; re-raise with the
                # location so a bad line can be found in a large file.
                raise ValueError(
                    f"{path}:{lineno}: invalid JSON ({err})"
                ) from err
    return records


# ------------------------------------------------------------
# Extract events
# ------------------------------------------------------------
def extract_events(records) -> tuple:
    """Flatten successful records into one list of scored events.

    Only records with ``status == "success"`` are considered. For each entry
    of a record's ``output`` that carries a ``content_parsed`` list, every
    event with numeric ``score`` and ``human_score`` yields a dict with the
    keys ``claim``, ``event``, ``reason``, ``score``, ``human_score`` and
    ``gap`` (``abs(score - human_score)``).

    Returns:
        ``(events, skipped)`` where ``skipped`` is the number of entries in
        ``content_parsed`` lists that were dropped because they were not
        objects or lacked a numeric ``score``/``human_score``.
    """
    events = []
    skipped = 0

    for item in records:
        if not isinstance(item, dict) or item.get("status") != "success":
            continue

        claim_text = item.get("text", "")

        # `or []` also covers an explicit null "output".
        for out in item.get("output") or []:
            if not isinstance(out, dict):
                continue
            parsed = out.get("content_parsed")
            if not isinstance(parsed, list):
                continue

            for ev in parsed:
                if not isinstance(ev, dict):
                    skipped += 1
                    continue

                score = ev.get("score")
                human = ev.get("human_score")
                if not (_is_number(score) and _is_number(human)):
                    # No gap can be computed without both scores.
                    skipped += 1
                    continue

                events.append({
                    "claim": claim_text,
                    "event": ev.get("event", ""),
                    "reason": ev.get("reasoningWhyRelevant", ""),
                    "score": score,
                    "human_score": human,
                    "gap": abs(score - human),
                })

    return events, skipped


# ------------------------------------------------------------
# Compute metrics
# ------------------------------------------------------------
def summarise(events) -> dict:
    """Compute the aggregate metrics for a list of extracted events.

    Returns:
        A dict with ``avg_score``, ``avg_gap`` and ``largest_gap_event``
        (the first event with the maximal ``gap``).

    Raises:
        ValueError: if ``events`` is empty.
    """
    if not events:
        raise ValueError("No events found in file.")

    return {
        "avg_score": mean(e["score"] for e in events),
        "avg_gap": mean(e["gap"] for e in events),
        "largest_gap_event": max(events, key=lambda e: e["gap"]),
    }


# ------------------------------------------------------------
# Output results
# ------------------------------------------------------------
def main(path: str = DATA_FILE) -> None:
    """Load ``path``, compute the metrics and print the report to stdout."""
    events, skipped = extract_events(load_records(path))
    if skipped:
        print(
            f"Warning: skipped {skipped} event(s) without numeric "
            "score/human_score.",
            file=sys.stderr,
        )

    summary = summarise(events)
    worst = summary["largest_gap_event"]

    print(f"Average score: {summary['avg_score']:.4f}")
    print(f"Average |human_score - score|: {summary['avg_gap']:.4f}")

    print("\nLargest gap event:")
    print(f"Event: {worst['event']}")
    print(f"Score: {worst['score']}")
    print(f"Human score: {worst['human_score']}")
    print(f"Gap: {worst['gap']:.4f}")

    # The worst-performing event is the largest-gap event; show its claim.
    print("\nWorst performing event and its claims:")
    print(f"Claim: {worst['claim']}")
    print(f"Worst Event: {worst['event']}")


if __name__ == "__main__":
    main()