Add date ranges to frontend visualisation

2026-04-24 16:40:10 +01:00
parent f5f8800173
commit ea220e023c
6 changed files with 526 additions and 24 deletions
@@ -1,8 +1,7 @@
-import csv
 import json
 import uuid
 from typing import List, Dict
-
+import dateparser
 import numpy as np
 from sentence_transformers import SentenceTransformer
 from sklearn.cluster import AgglomerativeClustering
@@ -10,7 +9,7 @@ from sklearn.metrics.pairwise import cosine_similarity
 from tqdm import tqdm


-INPUT_CSV = "../../data/dataset.csv"
+INPUT_CSV = "../../data/dataset.jsonl"
 OUTPUT_JSON = "../../data/clustered_output.json"
 MODEL_NAME = "all-MiniLM-L6-v2"
 SIMILARITY_THRESHOLD = 0.8
@@ -19,37 +18,50 @@ def generate_guid():
    return str(uuid.uuid4())


-def read_csv(file_path: str):
+def read_jsonl(file_path: str):
    data = []

-    with open(file_path, newline='', encoding='utf-8') as f:
-        reader = csv.reader(f)
-        for row in tqdm(reader, desc="Reading CSV"):
-            row = [r.strip() for r in row if r.strip()]
-            if not row:
+    with open(file_path, "r", encoding="utf-8") as f:
+        for line in tqdm(f, desc="Reading JSONL"):
+            line = line.strip()
+            if not line:
                continue

-            claim = row[0]
-            events = row[1:]
+            obj = json.loads(line)
+
+            claim_text = obj.get("claim", "").strip()
+            claim_date = obj.get("date", "").strip()
+            events = obj.get("events", [])
+
+            if not claim_text:
+                continue

            claim_id = generate_guid()

            event_objects = []
            for e in events:
+                event_text = e.get("Event", "").strip()
+                event_date = e.get("Date", "").strip()
+                if not event_text:
+                    continue
+
                event_objects.append({
                    "id": generate_guid(),
-                    "text": e
+                    "text": event_text,
+                    "date": dateparser.parse(event_date)
                })

            data.append({
                "claim": {
                    "id": claim_id,
-                    "text": claim
+                    "text": claim_text,
+                    "date": dateparser.parse(claim_date)
                },
                "events": event_objects
            })

-    return data
+        return data
+

 def embed_texts(model, texts: List[str], desc="Embedding"):
    embeddings = []
@@ -76,10 +88,10 @@ def main():
    print("Loading model...")
    model = SentenceTransformer(MODEL_NAME)

-    data = read_csv(INPUT_CSV)
+    data = read_jsonl(INPUT_CSV)

-    claim_texts, claim_ids = [], []
-    event_texts, event_ids = [], []
+    claim_texts, claim_ids, claim_dates = [], [], []
+    event_texts, event_ids, event_dates = [], [], []

    raw_links = []  # temporary for cluster mapping

@@ -87,10 +99,12 @@ def main():
        claim = entry["claim"]
        claim_ids.append(claim["id"])
        claim_texts.append(f"Claim: {claim['text']}")
+        claim_dates.append(claim['date'])

        for event in entry["events"]:
            event_ids.append(event["id"])
            event_texts.append(f"Event: {event['text']}")
+            event_dates.append(event['date'])

            raw_links.append({
                "claim_id": claim["id"],
@@ -148,12 +162,12 @@ def main():

    output = {
        "claims": [
-            {"id": cid, "text": txt.replace("Claim: ", "")}
-            for cid, txt in zip(claim_ids, claim_texts)
+            {"id": cid, "text": txt.replace("Claim: ", ""), "date": str(dat)}
+            for cid, txt, dat in zip(claim_ids, claim_texts, claim_dates)
        ],
        "events": [
-            {"id": eid, "text": txt.replace("Event: ", "")}
-            for eid, txt in zip(event_ids, event_texts)
+            {"id": eid, "text": txt.replace("Event: ", ""), "date": str(dat)}
+            for eid, txt, dat in zip(event_ids, event_texts, event_dates)
        ],
        "claim_clusters": [
            {"cluster_id": k, "members": v}