From ea220e023c3c2a75b05649c8cd5186f2e6c9ecf1 Mon Sep 17 00:00:00 2001 From: William Jeynes Date: Fri, 24 Apr 2026 16:40:10 +0100 Subject: [PATCH] Add date ranges to frontend visualisation --- graphviz/frontend/.gitignore | 4 +- graphviz/frontend/src/App2.tsx | 335 +++++++++++++++++++ graphviz/frontend/src/index.tsx | 4 +- graphviz/processing/create_clusters.py | 56 ++-- graphviz/processing/process_clusters_time.py | 150 +++++++++ graphviz/processing/requirements.txt | 1 + 6 files changed, 526 insertions(+), 24 deletions(-) create mode 100644 graphviz/frontend/src/App2.tsx create mode 100644 graphviz/processing/process_clusters_time.py diff --git a/graphviz/frontend/.gitignore b/graphviz/frontend/.gitignore index a5e11f8..b4fc009 100644 --- a/graphviz/frontend/.gitignore +++ b/graphviz/frontend/.gitignore @@ -2,4 +2,6 @@ dist/ node_modules/ src/data.json -src/titles.json \ No newline at end of file +src/data_date.json +src/titles.json +src/titles_date.json \ No newline at end of file diff --git a/graphviz/frontend/src/App2.tsx b/graphviz/frontend/src/App2.tsx new file mode 100644 index 0000000..c0174b1 --- /dev/null +++ b/graphviz/frontend/src/App2.tsx @@ -0,0 +1,335 @@ +import React, { useEffect, useMemo, useRef, useState } from "react"; +import ForceGraph2D from "react-force-graph-2d"; +import * as d3 from "d3-force-3d"; + +import data from "./data_date.json"; +import titlesData from "./titles_date.json"; + +function drawRoundedRect(ctx, x, y, width, height, radius) { + const r = Math.min(radius, width / 2, height / 2); + + ctx.beginPath(); + ctx.moveTo(x + r, y); + ctx.lineTo(x + width - r, y); + ctx.quadraticCurveTo(x + width, y, x + width, y + r); + ctx.lineTo(x + width, y + height - r); + ctx.quadraticCurveTo(x + width, y + height, x + width - r, y + height); + ctx.lineTo(x + r, y + height); + ctx.quadraticCurveTo(x, y + height, x, y + height - r); + ctx.lineTo(x, y + r); + ctx.quadraticCurveTo(x, y, x + r, y); + ctx.closePath(); +} + +function 
parseDateSafe(dateStr) { + if (!dateStr) return null; + const d = new Date(dateStr); + if (isNaN(d.getTime())) return null; + if (d.getFullYear() < 2016) return null; // filter erroneous + return d; +} + +function monthsDiff(a, b) { + const ms = Math.abs(a - b); + return ms / (1000 * 60 * 60 * 24 * 30.44); +} + +function buildLookupMaps(data) { + const claimMap = new Map(data.claims.map(c => [c.id, c])); + const eventMap = new Map(data.events.map(e => [e.id, e])); + return { claimMap, eventMap }; +} + +function computeClusterAvgDate(members, claimMap, eventMap) { + const dates = []; + + members.forEach(id => { + const c = claimMap.get(id); + const e = eventMap.get(id); + + const raw = c?.date || e?.date; + const parsed = parseDateSafe(raw); + + if (parsed) dates.push(parsed.getTime()); + }); + + if (!dates.length) return null; + + const avg = dates.reduce((a, b) => a + b, 0) / dates.length; + return new Date(avg); +} + +function buildGraph(data) { + const nodes = []; + const links = []; + + const titleMap = new Map(titlesData.map(t => [t.cluster_id, t.title])); + const { claimMap, eventMap } = buildLookupMaps(data); + + data.claim_clusters.forEach((cluster) => { + const avgDate = computeClusterAvgDate(cluster.members, claimMap, eventMap); + + nodes.push({ + id: cluster.cluster_id, + label: titleMap.get(cluster.cluster_id) || cluster.title || "Unnamed Claim Cluster", + type: "claim_cluster", + members: cluster.members, + avgDate + }); + }); + + data.event_clusters.forEach((cluster) => { + const avgDate = computeClusterAvgDate(cluster.members, claimMap, eventMap); + + nodes.push({ + id: cluster.cluster_id, + label: titleMap.get(cluster.cluster_id) || cluster.title || "Unnamed Event Cluster", + type: "event_cluster", + members: cluster.members, + avgDate + }); + }); + + data.cluster_links.forEach((link) => { + links.push({ + source: link.claim_cluster_id, + target: link.event_cluster_id + }); + }); + + return { nodes, links }; +} + +function 
getConnectedComponents(nodes, links) { + const adj = new Map(); + nodes.forEach(n => adj.set(n.id, new Set())); + + links.forEach(l => { + adj.get(l.source)?.add(l.target); + adj.get(l.target)?.add(l.source); + }); + + const visited = new Set(); + const components = []; + + for (const node of nodes) { + if (visited.has(node.id)) continue; + + const stack = [node.id]; + const comp = []; + + while (stack.length) { + const id = stack.pop(); + if (visited.has(id)) continue; + + visited.add(id); + comp.push(id); + + adj.get(id)?.forEach(nei => { + if (!visited.has(nei)) stack.push(nei); + }); + } + + components.push(comp); + } + + return components; +} + +export function App2() { + const fgRef = useRef(); + const [selectedNode, setSelectedNode] = useState(null); + const [inputDate, setInputDate] = useState(""); + + const parsedInputDate = useMemo(() => { + const d = new Date(inputDate); + return isNaN(d.getTime()) ? null : d; + }, [inputDate]); + + const graphData = useMemo(() => { + const full = buildGraph(data); + const components = getConnectedComponents(full.nodes, full.links); + + const validIds = new Set( + components.filter(c => c.length > 1000).flat() + ); + + return { + nodes: full.nodes.filter(n => validIds.has(n.id)), + links: full.links.filter( + l => validIds.has(l.source) && validIds.has(l.target) + ) + }; + }, []); + + useEffect(() => { + if (!fgRef.current) return; + + fgRef.current.d3Force("charge", d3.forceManyBody().strength(-30)); + + fgRef.current.d3Force( + "link", + d3.forceLink().distance(140) + ); + + fgRef.current.d3Force( + "collision", + d3.forceCollide((node) => { + const dims = node.__bckgDimensions; + return dims ? 
Math.max(dims[0], dims[1]) / 2 + 32 : 40; + }) + ); + + fgRef.current.d3ReheatSimulation(); + }, [graphData]); + function isNodeHighlighted(node, referenceDate) { + if (!referenceDate || !node.avgDate) return false; + const diffMonths = monthsDiff(referenceDate, node.avgDate); // absolute gap in average (30.44-day) months + return diffMonths <= 6; // highlighted when within six months of the reference date + } + const highlightedNodeIds = useMemo(() => { + if (!parsedInputDate) return new Set(); + + const set = new Set(); + + graphData.nodes.forEach((n) => { + if (isNodeHighlighted(n, parsedInputDate)) { + set.add(n.id); + } + }); + + return set; + }, [graphData.nodes, parsedInputDate]); + + return ( +
+ node.label} + nodeAutoColorBy="type" + linkColor={(link) => { + const sourceId = + typeof link.source === "object" ? link.source.id : link.source; + + const targetId = + typeof link.target === "object" ? link.target.id : link.target; + + const bothHighlighted = + highlightedNodeIds.has(sourceId) && + highlightedNodeIds.has(targetId); + + return bothHighlighted ? "orange" : "white"; + }} + linkWidth={2.5} + onNodeClick={(node) => setSelectedNode(node)} + nodeCanvasObject={(node, ctx) => { + const label = node.label; + + const fontSize = 16 + 32 * Math.min(node.members.length, 5); + ctx.font = `${fontSize}px Sans-Serif`; + + const textWidth = ctx.measureText(label).width; + const padding = fontSize * 0.6; + + const width = textWidth + padding; + const height = fontSize + padding; + + const x = node.x - width / 2; + const y = node.y - height / 2; + + const radius = Math.min(10, fontSize * 0.6); + + let isHighlighted = false; + + if (parsedInputDate && node.avgDate) { + const diffMonths = monthsDiff(parsedInputDate, node.avgDate); + isHighlighted = diffMonths <= 6; + } + + ctx.fillStyle = node.type.includes("claim") + ? "blue" + : "green" + + if (isHighlighted) { + drawRoundedRect(ctx, x, y, width, height, radius); + ctx.fill(); + ctx.strokeStyle = "white"; + ctx.stroke(); + + ctx.textAlign = "center"; + ctx.textBaseline = "middle"; + ctx.fillStyle = "white"; + ctx.fillText(label, node.x, node.y); + + } + + + + node.__bckgDimensions = [width, height]; + node.__bckgPos = { x, y }; + }} + nodePointerAreaPaint={(node, color, ctx) => { + const dims = node.__bckgDimensions; + const pos = node.__bckgPos; + if (!dims || !pos) return; + + ctx.fillStyle = color; + drawRoundedRect(ctx, pos.x, pos.y, dims[0], dims[1], 6); + ctx.fill(); + }} + /> + +
+

FILTERS

+ + + +

Details

+ {selectedNode ? ( +
+

Title: {selectedNode.label}

+ + {selectedNode.members && ( +
+

Members:

+
    + {selectedNode.members.map((m) => { + const memberData = + data.claims.find((c) => c.id === m) || + data.events.find((e) => e.id === m); + + return ( +
  • + {memberData ? memberData.text : m} +
  • + ); + })} +
+
+ )} +
+ ) : ( +

Click a node to see details

+ )} +
+
+ ); +} \ No newline at end of file diff --git a/graphviz/frontend/src/index.tsx b/graphviz/frontend/src/index.tsx index e2d3e7b..fb3d0c2 100644 --- a/graphviz/frontend/src/index.tsx +++ b/graphviz/frontend/src/index.tsx @@ -1,11 +1,11 @@ import { createRoot } from 'react-dom/client'; import { StrictMode } from 'react'; -import { App } from './App'; +import { App2 } from './App2'; let container = document.getElementById("app")!; let root = createRoot(container) root.render( - + ); diff --git a/graphviz/processing/create_clusters.py b/graphviz/processing/create_clusters.py index a885501..1ba4c6a 100644 --- a/graphviz/processing/create_clusters.py +++ b/graphviz/processing/create_clusters.py @@ -1,8 +1,7 @@ -import csv import json import uuid from typing import List, Dict - +import dateparser import numpy as np from sentence_transformers import SentenceTransformer from sklearn.cluster import AgglomerativeClustering @@ -10,7 +9,7 @@ from sklearn.metrics.pairwise import cosine_similarity from tqdm import tqdm -INPUT_CSV = "../../data/dataset.csv" +INPUT_CSV = "../../data/dataset.jsonl" OUTPUT_JSON = "../../data/clustered_output.json" MODEL_NAME = "all-MiniLM-L6-v2" SIMILARITY_THRESHOLD = 0.8 @@ -19,37 +18,50 @@ def generate_guid(): return str(uuid.uuid4()) -def read_csv(file_path: str): +def read_jsonl(file_path: str): data = [] - with open(file_path, newline='', encoding='utf-8') as f: - reader = csv.reader(f) - for row in tqdm(reader, desc="Reading CSV"): - row = [r.strip() for r in row if r.strip()] - if not row: + with open(file_path, "r", encoding="utf-8") as f: + for line in tqdm(f, desc="Reading JSONL"): + line = line.strip() + if not line: continue - claim = row[0] - events = row[1:] + obj = json.loads(line) + + claim_text = obj.get("claim", "").strip() + claim_date = obj.get("date", "").strip() + events = obj.get("events", []) + + if not claim_text: + continue claim_id = generate_guid() event_objects = [] for e in events: + event_text = e.get("Event", 
"").strip() + event_date = e.get("Date", "").strip() + if not event_text: + continue + event_objects.append({ "id": generate_guid(), - "text": e + "text": event_text, + "date": dateparser.parse(event_date) }) data.append({ "claim": { "id": claim_id, - "text": claim + "text": claim_text, + "date": dateparser.parse(claim_date) }, "events": event_objects }) - return data + return data + def embed_texts(model, texts: List[str], desc="Embedding"): embeddings = [] @@ -76,10 +88,10 @@ def main(): print("Loading model...") model = SentenceTransformer(MODEL_NAME) - data = read_csv(INPUT_CSV) + data = read_jsonl(INPUT_CSV) - claim_texts, claim_ids = [], [] - event_texts, event_ids = [], [] + claim_texts, claim_ids, claim_dates = [], [], [] + event_texts, event_ids, event_dates = [], [], [] raw_links = [] # temporary for cluster mapping @@ -87,10 +99,12 @@ def main(): claim = entry["claim"] claim_ids.append(claim["id"]) claim_texts.append(f"Claim: {claim['text']}") + claim_dates.append(claim['date']) for event in entry["events"]: event_ids.append(event["id"]) event_texts.append(f"Event: {event['text']}") + event_dates.append(event['date']) raw_links.append({ "claim_id": claim["id"], @@ -148,12 +162,12 @@ def main(): output = { "claims": [ - {"id": cid, "text": txt.replace("Claim: ", "")} - for cid, txt in zip(claim_ids, claim_texts) + {"id": cid, "text": txt.replace("Claim: ", ""), "date": dat.isoformat() if dat is not None else None} # ISO 8601 string, or JSON null when no date was parsed + for cid, txt, dat in zip(claim_ids, claim_texts, claim_dates) ], "events": [ - {"id": eid, "text": txt.replace("Event: ", "")} - for eid, txt in zip(event_ids, event_texts) + {"id": eid, "text": txt.replace("Event: ", ""), "date": dat.isoformat() if dat is not None else None} + for eid, txt, dat in zip(event_ids, event_texts, event_dates) ], "claim_clusters": [ {"cluster_id": k, "members": v} diff --git a/graphviz/processing/process_clusters_time.py b/graphviz/processing/process_clusters_time.py new file mode 100644 index 0000000..c5e240a --- /dev/null +++ b/graphviz/processing/process_clusters_time.py @@ -0,0
+1,150 @@ +import json +from collections import defaultdict, deque +from openai import OpenAI +from tqdm import tqdm +from dotenv import load_dotenv +import os +from concurrent.futures import ThreadPoolExecutor, as_completed + +# ------------------------------- +# Load environment and OpenAI client +# ------------------------------- +load_dotenv() # Load environment variables from .env file +client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) + +# ------------------------------- +# CONFIG +# ------------------------------- +INPUT_FILE = "../../data/clustered_output.json" # Your original JSON +OUTPUT_FILE = "../../data/clustered_output_time.json" # Output JSON file +OPENAI_MODEL = "gpt-5-nano" + +# ------------------------------- +# Load data +# ------------------------------- +with open(INPUT_FILE, "r") as f: + data = json.load(f) + +# ------------------------------- +# Prepare cluster sets +# ------------------------------- +claim_clusters = {c["cluster_id"] for c in data["claim_clusters"]} +event_clusters = {e["cluster_id"] for e in data["event_clusters"]} +all_clusters = claim_clusters.union(event_clusters) + +# ------------------------------- +# Build graph +# ------------------------------- +graph = defaultdict(set) +for link in data.get("cluster_links", []): + c_id = link["claim_cluster_id"] + e_id = link["event_cluster_id"] + graph[c_id].add(e_id) + graph[e_id].add(c_id) + +for cid in all_clusters: + graph[cid] = graph[cid] # defaultdict lookup creates an empty entry for clusters that have no links + +# ------------------------------- +# Find connected components +# ------------------------------- +visited = set() +components = [] + +for node in graph: + if node not in visited: + queue = deque([node]) + component = set() + while queue: + current = queue.popleft() + if current in visited: + continue + visited.add(current) + component.add(current) + for neighbor in graph[current]: + if neighbor not in visited: + queue.append(neighbor) + components.append(component) + +# Keep only components with more than 1000 clusters +large_components = [c
for c in components if len(c) > 1000] + +print("Connected components (size > 1000):", len(large_components)) +print("Total clusters in those components:", sum(len(c) for c in large_components)) + +# ------------------------------- +# Prepare lookups +# ------------------------------- +claim_lookup = {c["id"]: c["text"] for c in data["claims"]} +event_lookup = {e["id"]: e["text"] for e in data["events"]} +claim_cluster_map = {c["cluster_id"]: c["members"] for c in data["claim_clusters"]} +event_cluster_map = {e["cluster_id"]: e["members"] for e in data["event_clusters"]} + +def extract_texts_for_cluster(cluster_id): + texts = [] + if cluster_id in claim_cluster_map: + texts.extend([claim_lookup[mid] for mid in claim_cluster_map[cluster_id] if mid in claim_lookup]) + elif cluster_id in event_cluster_map: + texts.extend([event_lookup[mid] for mid in event_cluster_map[cluster_id] if mid in event_lookup]) + return texts + +# ------------------------------- +# GPT-based title generation +# ------------------------------- +def generate_title(texts): + prompt = ( + "Summarize the following texts into a concise 3 - 6 word title that captures the main theme:\n\n" + + "\n".join(f"- {t}" for t in texts) + + "\n\nTitle:" + ) + try: + # response = client.chat.completions.create( + # model=OPENAI_MODEL, + # messages=[ + # {"role": "system", "content": "You are a helpful assistant who creates short, meaningful titles."}, + # {"role": "user", "content": prompt} + # ] + # ) + # title = response.choices[0].message.content.strip() + # if title.lower().startswith("title:"): + # title = title[6:].strip() + # return title + return "UNNAMED" + except Exception as e: + print("Error generating title:", e) + return "Untitled Cluster" + +# ------------------------------- +# Wrapper for parallel execution +# ------------------------------- +def generate_title_for_cluster(cluster_id): + texts = extract_texts_for_cluster(cluster_id) + title = generate_title(texts) + return {"cluster_id":
cluster_id, "title": title} + +# ------------------------------- +# Generate titles in parallel +# ------------------------------- +clusters_in_large_components = [cid for comp in large_components for cid in comp] +output = [] + +print("\nGenerating GPT titles for clusters (parallel)...") + +with ThreadPoolExecutor(max_workers=10) as executor: + future_to_cluster = {executor.submit(generate_title_for_cluster, cid): cid for cid in clusters_in_large_components} + for future in tqdm(as_completed(future_to_cluster), total=len(clusters_in_large_components), desc="Clusters", ncols=100): + try: + result = future.result() + output.append(result) + except Exception as e: + cid = future_to_cluster[future] + print(f"Error processing cluster {cid}: {e}") + output.append({"cluster_id": cid, "title": "Untitled Cluster"}) + +# ------------------------------- +# Save JSON +# ------------------------------- +with open(OUTPUT_FILE, "w") as f: + json.dump(output, f, indent=2) + +print(f"\nSaved cluster titles to {OUTPUT_FILE}") \ No newline at end of file diff --git a/graphviz/processing/requirements.txt b/graphviz/processing/requirements.txt index 470db3c..e05b441 100644 --- a/graphviz/processing/requirements.txt +++ b/graphviz/processing/requirements.txt @@ -1 +1,2 @@ sentence_transformers +dateparser \ No newline at end of file