Add date ranges to frontend visualisation

This commit is contained in:
William Jeynes
2026-04-24 16:40:10 +01:00
parent f5f8800173
commit ea220e023c
6 changed files with 526 additions and 24 deletions
+35 -21
View File
@@ -1,8 +1,7 @@
import csv
import json
import uuid
from typing import List, Dict
import dateparser
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
@@ -10,7 +9,7 @@ from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
# Path to the input dataset, stored as JSON Lines (one JSON object per line).
# The constant keeps its historical INPUT_CSV name because main() passes it
# to the reader under that name.
INPUT_CSV = "../../data/dataset.jsonl"
# Destination path for the clustered result.
OUTPUT_JSON = "../../data/clustered_output.json"
# Model identifier handed to SentenceTransformer.
MODEL_NAME = "all-MiniLM-L6-v2"
# NOTE(review): presumably the cosine-similarity cut-off used when clustering;
# the code that consumes it is not visible here - confirm.
SIMILARITY_THRESHOLD = 0.8
@@ -19,37 +18,50 @@ def generate_guid():
return str(uuid.uuid4())
def _clean_text(value) -> str:
    """Return *value* as stripped text, mapping a JSON null to ""."""
    if value is None:
        return ""
    # Non-string scalars (e.g. a bare year such as 2020) are stringified
    # rather than crashing on a missing .strip() method.
    return str(value).strip()


def _parse_date(value):
    """Parse a free-form date value; return None when it is missing or blank."""
    text = _clean_text(value)
    # Nothing to parse for blank input, so skip the parser call entirely.
    return dateparser.parse(text) if text else None


def read_jsonl(file_path: str) -> List[Dict]:
    """Load claims and their linked events from a JSON Lines file.

    Each non-blank line is decoded as one JSON object. The object's "claim"
    and "date" keys describe the claim, and "events" holds a list of objects
    with "Event" (text) and "Date" keys. Records without claim text and
    events without event text are skipped.

    Args:
        file_path: Path to the UTF-8 encoded JSONL file.

    Returns:
        A list of ``{"claim": {...}, "events": [...]}`` dicts. Every claim
        and event carries a freshly generated "id", its "text", and a "date"
        that is whatever ``dateparser.parse`` returned, or None when the
        source date was missing or blank.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []

    with open(file_path, "r", encoding="utf-8") as f:
        for line in tqdm(f, desc="Reading JSONL"):
            line = line.strip()
            if not line:
                continue

            obj = json.loads(line)

            claim_text = _clean_text(obj.get("claim"))
            if not claim_text:
                continue

            claim_id = generate_guid()

            event_objects = []
            # `or []` also covers an explicit `"events": null`, which a
            # .get() default alone would not.
            for e in obj.get("events") or []:
                event_text = _clean_text(e.get("Event"))
                if not event_text:
                    continue

                event_objects.append({
                    "id": generate_guid(),
                    "text": event_text,
                    "date": _parse_date(e.get("Date")),
                })

            data.append({
                "claim": {
                    "id": claim_id,
                    "text": claim_text,
                    "date": _parse_date(obj.get("date")),
                },
                "events": event_objects,
            })

    return data
def embed_texts(model, texts: List[str], desc="Embedding"):
embeddings = []
@@ -76,10 +88,10 @@ def main():
print("Loading model...")
model = SentenceTransformer(MODEL_NAME)
data = read_csv(INPUT_CSV)
data = read_jsonl(INPUT_CSV)
claim_texts, claim_ids = [], []
event_texts, event_ids = [], []
claim_texts, claim_ids, claim_dates = [], [], []
event_texts, event_ids, event_dates = [], [], []
raw_links = [] # temporary for cluster mapping
@@ -87,10 +99,12 @@ def main():
claim = entry["claim"]
claim_ids.append(claim["id"])
claim_texts.append(f"Claim: {claim['text']}")
claim_dates.append(claim['date'])
for event in entry["events"]:
event_ids.append(event["id"])
event_texts.append(f"Event: {event['text']}")
event_dates.append(event['date'])
raw_links.append({
"claim_id": claim["id"],
@@ -148,12 +162,12 @@ def main():
output = {
"claims": [
{"id": cid, "text": txt.replace("Claim: ", "")}
for cid, txt in zip(claim_ids, claim_texts)
{"id": cid, "text": txt.replace("Claim: ", ""), "date": str(dat)}
for cid, txt, dat in zip(claim_ids, claim_texts, claim_dates)
],
"events": [
{"id": eid, "text": txt.replace("Event: ", "")}
for eid, txt in zip(event_ids, event_texts)
{"id": eid, "text": txt.replace("Event: ", ""), "date": str(dat)}
for eid, txt, dat in zip(event_ids, event_texts, event_dates)
],
"claim_clusters": [
{"cluster_id": k, "members": v}