Add graph visualiser initial version

This commit is contained in:
William Jeynes
2026-04-08 21:00:24 +01:00
parent cf6b29ca91
commit aa962b1802
9 changed files with 3803 additions and 41 deletions
+39 -41
View File
@@ -13,17 +13,13 @@ from tqdm import tqdm
INPUT_CSV = "../../data/dataset-dev.csv" INPUT_CSV = "../../data/dataset-dev.csv"
OUTPUT_JSON = "../../data/clustered_output.json" OUTPUT_JSON = "../../data/clustered_output.json"
MODEL_NAME = "all-MiniLM-L6-v2" MODEL_NAME = "all-MiniLM-L6-v2"
SIMILARITY_THRESHOLD = 0.65 SIMILARITY_THRESHOLD = 0.55
def generate_guid(): def generate_guid():
return str(uuid.uuid4()) return str(uuid.uuid4())
def read_csv(file_path: str): def read_csv(file_path: str):
"""
Expected format per row:
[claim, event1, event2, event3, ...]
"""
data = [] data = []
with open(file_path, newline='', encoding='utf-8') as f: with open(file_path, newline='', encoding='utf-8') as f:
@@ -63,10 +59,7 @@ def embed_texts(model, texts: List[str], desc="Embedding"):
return np.array(embeddings) return np.array(embeddings)
def cluster_embeddings(embeddings, threshold=0.75, desc="Clustering"): def cluster_embeddings(embeddings, threshold=0.75):
"""
Uses Agglomerative clustering with cosine distance
"""
distance_matrix = 1 - cosine_similarity(embeddings) distance_matrix = 1 - cosine_similarity(embeddings)
clustering = AgglomerativeClustering( clustering = AgglomerativeClustering(
@@ -85,68 +78,74 @@ def main():
data = read_csv(INPUT_CSV) data = read_csv(INPUT_CSV)
# Collect all claims and events separately claim_texts, claim_ids = [], []
claim_texts = [] event_texts, event_ids = [], []
claim_ids = []
event_texts = [] raw_links = [] # temporary for cluster mapping
event_ids = []
links = [] # claim -> events
for entry in tqdm(data, desc="Processing rows"): for entry in tqdm(data, desc="Processing rows"):
claim = entry["claim"] claim = entry["claim"]
claim_ids.append(claim["id"]) claim_ids.append(claim["id"])
# Context-enhanced claim
claim_texts.append(f"Claim: {claim['text']}") claim_texts.append(f"Claim: {claim['text']}")
for event in entry["events"]: for event in entry["events"]:
event_ids.append(event["id"]) event_ids.append(event["id"])
# Context-enhanced event
event_texts.append(f"Event: {event['text']}") event_texts.append(f"Event: {event['text']}")
links.append({ raw_links.append({
"claim_id": claim["id"], "claim_id": claim["id"],
"event_id": event["id"] "event_id": event["id"]
}) })
# Embed
print("Embedding claims...") print("Embedding claims...")
claim_embeddings = embed_texts(model, claim_texts, desc="Claims") claim_embeddings = embed_texts(model, claim_texts, desc="Claims")
print("Embedding events...") print("Embedding events...")
event_embeddings = embed_texts(model, event_texts, desc="Events") event_embeddings = embed_texts(model, event_texts, desc="Events")
# Cluster
print("Clustering claims...") print("Clustering claims...")
claim_labels = cluster_embeddings(claim_embeddings, SIMILARITY_THRESHOLD) claim_labels = cluster_embeddings(claim_embeddings, SIMILARITY_THRESHOLD)
print("Clustering events...") print("Clustering events...")
event_labels = cluster_embeddings(event_embeddings, SIMILARITY_THRESHOLD) event_labels = cluster_embeddings(event_embeddings, SIMILARITY_THRESHOLD)
# Build cluster structures # Assign GUIDs to clusters
claim_clusters: Dict[int, List[str]] = {} claim_cluster_map = {}
for label in set(claim_labels):
claim_cluster_map[int(label)] = generate_guid()
event_cluster_map = {}
for label in set(event_labels):
event_cluster_map[int(label)] = generate_guid()
# Build cluster membership
claim_clusters = {}
for cid, label in zip(claim_ids, claim_labels): for cid, label in zip(claim_ids, claim_labels):
claim_clusters.setdefault(int(label), []).append(cid) cluster_guid = claim_cluster_map[int(label)]
claim_clusters.setdefault(cluster_guid, []).append(cid)
event_clusters: Dict[int, List[str]] = {} event_clusters = {}
for eid, label in zip(event_ids, event_labels): for eid, label in zip(event_ids, event_labels):
event_clusters.setdefault(int(label), []).append(eid) cluster_guid = event_cluster_map[int(label)]
event_clusters.setdefault(cluster_guid, []).append(eid)
# Build cluster-level links # Build ONLY cluster-level links
cluster_links = [] cluster_links = set()
for link in links:
claim_cluster = int(claim_labels[claim_ids.index(link["claim_id"])])
event_cluster = int(event_labels[event_ids.index(link["event_id"])])
cluster_links.append({ for link in raw_links:
"claim_cluster": claim_cluster, claim_label = int(claim_labels[claim_ids.index(link["claim_id"])])
"event_cluster": event_cluster event_label = int(event_labels[event_ids.index(link["event_id"])])
})
claim_cluster_guid = claim_cluster_map[claim_label]
event_cluster_guid = event_cluster_map[event_label]
cluster_links.add((claim_cluster_guid, event_cluster_guid))
cluster_links = [
{"claim_cluster_id": c, "event_cluster_id": e}
for c, e in cluster_links
]
# Output structure
output = { output = {
"claims": [ "claims": [
{"id": cid, "text": txt.replace("Claim: ", "")} {"id": cid, "text": txt.replace("Claim: ", "")}
@@ -157,14 +156,13 @@ def main():
for eid, txt in zip(event_ids, event_texts) for eid, txt in zip(event_ids, event_texts)
], ],
"claim_clusters": [ "claim_clusters": [
{"cluster_id": int(k), "members": v} {"cluster_id": k, "members": v}
for k, v in claim_clusters.items() for k, v in claim_clusters.items()
], ],
"event_clusters": [ "event_clusters": [
{"cluster_id": int(k), "members": v} {"cluster_id": k, "members": v}
for k, v in event_clusters.items() for k, v in event_clusters.items()
], ],
"links": links,
"cluster_links": cluster_links "cluster_links": cluster_links
} }
+4
View File
@@ -0,0 +1,4 @@
.parcel-cache/
dist/
node_modules/
src/data.json
File diff suppressed because it is too large Load Diff
+20
View File
@@ -0,0 +1,20 @@
{
"name": "parcel-react-client-starter",
"private": true,
"version": "0.0.0",
"source": "src/index.html",
"scripts": {
"start": "parcel",
"build": "parcel build"
},
"dependencies": {
"react": "^19.2.5",
"react-dom": "^19.2.5",
"react-force-graph-2d": "^1.29.1"
},
"devDependencies": {
"@types/react": "^19.0.0",
"@types/react-dom": "^19.0.0",
"parcel": "^2.14.0"
}
}
+8
View File
@@ -0,0 +1,8 @@
/* Root element: declares support for both light and dark colour schemes,
   uses the system UI font, and makes the root a full-height flex container
   that centres its content horizontally and vertically. */
html {
color-scheme: light dark;
font-family: system-ui;
display: flex;
align-items: center;
justify-content: center;
height: 100%;
}
+104
View File
@@ -0,0 +1,104 @@
import React, { useEffect, useMemo, useState } from "react";
import ForceGraph2D from "react-force-graph-2d";
import data from "./data.json";
/**
 * Convert clustering output into the `{ nodes, links }` structure that is
 * passed to ForceGraph2D as `graphData`.
 *
 * Every entry of `data.claim_clusters` and `data.event_clusters` becomes one
 * node (`id`, `label`, `type`, `members`); every entry of `data.cluster_links`
 * becomes one link from its claim cluster to its event cluster.
 *
 * @param {object} data Parsed clustering output. Missing `claim_clusters`,
 *   `event_clusters` or `cluster_links` arrays are treated as empty.
 * @returns {{nodes: object[], links: {source: *, target: *}[]}} Graph data.
 */
function buildGraph(data) {
  // One node per cluster; `title` is optional in the data, so fall back to a
  // generic label. `members` defaults to [] because consumers read its length.
  const toNode = (cluster, type, fallbackLabel) => ({
    id: cluster.cluster_id,
    label: cluster.title || fallbackLabel,
    type,
    members: cluster.members ?? []
  });

  const claimNodes = (data.claim_clusters ?? []).map((cluster) =>
    toNode(cluster, "claim_cluster", "Unnamed Claim Cluster")
  );
  const eventNodes = (data.event_clusters ?? []).map((cluster) =>
    toNode(cluster, "event_cluster", "Unnamed Event Cluster")
  );

  const claimIds = new Set(claimNodes.map((node) => node.id));
  const eventIds = new Set(eventNodes.map((node) => node.id));

  // Keep only links whose two endpoints exist as nodes: the force layout
  // resolves link endpoints by node id and fails on an unknown id.
  const links = [];
  for (const link of data.cluster_links ?? []) {
    const source = link.claim_cluster_id;
    const target = link.event_cluster_id;
    if (!claimIds.has(source) || !eventIds.has(target)) {
      console.warn("Skipping cluster link with unknown endpoint", link);
      continue;
    }
    links.push({ source, target });
  }

  return { nodes: [...claimNodes, ...eventNodes], links };
}
export function App() {
const [selectedNode, setSelectedNode] = useState(null);
const graphData = useMemo(() => buildGraph(data), []);
function setNode(node) {
console.log(node)
setSelectedNode(node)
}
return (
<div>
<div>
<ForceGraph2D
graphData={graphData}
nodeLabel={(node) => node.label}
nodeAutoColorBy="type"
//linkDirectionalParticles={1}
//linkDirectionalParticleSpeed={0.002}
onNodeRightClick={(node) => setNode(node)}
nodeCanvasObject={(node, ctx, globalScale) => {
const fontSize = 12;
ctx.font = `${fontSize}px Sans-Serif`;
ctx.fillStyle = node.type.includes('claim') ? "blue" : "green";
ctx.beginPath();
ctx.arc(node.x, node.y, 2*node.members.length , 0, 2 * Math.PI, false);
ctx.fill();
if (node.members.length > 2) {
ctx.fillStyle = "black";
ctx.fillText(node.label, node.x + 12, node.y + 4);
}
}}
/>
</div>
<div style={{ position:"absolute", top:0, left:0 }}>
<h2>Details</h2>
{selectedNode ? (
<div>
<p><strong>ID:</strong> {selectedNode.id}</p>
<p><strong>Type:</strong> {selectedNode.type}</p>
<p><strong>Title / Label:</strong> {selectedNode.label}</p>
{selectedNode.members && (
<div>
<p><strong>Members:</strong></p>
<ul>
{selectedNode.members.map((m) => {
const memberData = data.claims.find(c => c.id === m) || data.events.find(e => e.id === m);
return <li key={m}>{memberData ? memberData.text : m}</li>;
})}
</ul>
</div>
)}
</div>
) : (
<p>Click a cluster node to see its members</p>
)}
</div>
</div>
);
}
@@ -0,0 +1,12 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Parcel React App</title>
</head>
<body>
<div id="app"></div>
<script type="module" src="index.tsx"></script>
</body>
</html>
@@ -0,0 +1,11 @@
import { createRoot } from 'react-dom/client';
import { StrictMode } from 'react';
import { App } from './App';
// Mount the React tree into the element with id "app"; the non-null
// assertion assumes that element is present in the hosting page.
const mountPoint = document.getElementById("app")!;

createRoot(mountPoint).render(
  <StrictMode>
    <App />
  </StrictMode>
);
@@ -0,0 +1,29 @@
{
"compilerOptions": {
/* Visit https://aka.ms/tsconfig to read more about this file */
"target": "ES2020",
"lib": ["ES2020", "DOM", "DOM.Iterable"],
"jsx": "react-jsx",
"useDefineForClassFields": true,
/* Modules */
"module": "ESNext",
"moduleResolution": "bundler",
/* Emit */
"noEmit": true,
/* Interop Constraints */
"isolatedModules": true,
"allowSyntheticDefaultImports": true,
"allowImportingTsExtensions": true,
"esModuleInterop": true,
"forceConsistentCasingInFileNames": true,
/* Type Checking */
"strict": true,
/* Completeness */
"skipLibCheck": true
}
}