Add graph visualiser initial version
This commit is contained in:
@@ -13,17 +13,13 @@ from tqdm import tqdm
|
|||||||
INPUT_CSV = "../../data/dataset-dev.csv"
|
INPUT_CSV = "../../data/dataset-dev.csv"
|
||||||
OUTPUT_JSON = "../../data/clustered_output.json"
|
OUTPUT_JSON = "../../data/clustered_output.json"
|
||||||
MODEL_NAME = "all-MiniLM-L6-v2"
|
MODEL_NAME = "all-MiniLM-L6-v2"
|
||||||
SIMILARITY_THRESHOLD = 0.65
|
SIMILARITY_THRESHOLD = 0.55
|
||||||
|
|
||||||
def generate_guid():
|
def generate_guid():
|
||||||
return str(uuid.uuid4())
|
return str(uuid.uuid4())
|
||||||
|
|
||||||
|
|
||||||
def read_csv(file_path: str):
|
def read_csv(file_path: str):
|
||||||
"""
|
|
||||||
Expected format per row:
|
|
||||||
[claim, event1, event2, event3, ...]
|
|
||||||
"""
|
|
||||||
data = []
|
data = []
|
||||||
|
|
||||||
with open(file_path, newline='', encoding='utf-8') as f:
|
with open(file_path, newline='', encoding='utf-8') as f:
|
||||||
@@ -63,10 +59,7 @@ def embed_texts(model, texts: List[str], desc="Embedding"):
|
|||||||
return np.array(embeddings)
|
return np.array(embeddings)
|
||||||
|
|
||||||
|
|
||||||
def cluster_embeddings(embeddings, threshold=0.75, desc="Clustering"):
|
def cluster_embeddings(embeddings, threshold=0.75):
|
||||||
"""
|
|
||||||
Uses Agglomerative clustering with cosine distance
|
|
||||||
"""
|
|
||||||
distance_matrix = 1 - cosine_similarity(embeddings)
|
distance_matrix = 1 - cosine_similarity(embeddings)
|
||||||
|
|
||||||
clustering = AgglomerativeClustering(
|
clustering = AgglomerativeClustering(
|
||||||
@@ -85,68 +78,74 @@ def main():
|
|||||||
|
|
||||||
data = read_csv(INPUT_CSV)
|
data = read_csv(INPUT_CSV)
|
||||||
|
|
||||||
# Collect all claims and events separately
|
claim_texts, claim_ids = [], []
|
||||||
claim_texts = []
|
event_texts, event_ids = [], []
|
||||||
claim_ids = []
|
|
||||||
|
|
||||||
event_texts = []
|
raw_links = [] # temporary for cluster mapping
|
||||||
event_ids = []
|
|
||||||
|
|
||||||
links = [] # claim -> events
|
|
||||||
|
|
||||||
for entry in tqdm(data, desc="Processing rows"):
|
for entry in tqdm(data, desc="Processing rows"):
|
||||||
claim = entry["claim"]
|
claim = entry["claim"]
|
||||||
claim_ids.append(claim["id"])
|
claim_ids.append(claim["id"])
|
||||||
|
|
||||||
# Context-enhanced claim
|
|
||||||
claim_texts.append(f"Claim: {claim['text']}")
|
claim_texts.append(f"Claim: {claim['text']}")
|
||||||
|
|
||||||
for event in entry["events"]:
|
for event in entry["events"]:
|
||||||
event_ids.append(event["id"])
|
event_ids.append(event["id"])
|
||||||
|
|
||||||
# Context-enhanced event
|
|
||||||
event_texts.append(f"Event: {event['text']}")
|
event_texts.append(f"Event: {event['text']}")
|
||||||
|
|
||||||
links.append({
|
raw_links.append({
|
||||||
"claim_id": claim["id"],
|
"claim_id": claim["id"],
|
||||||
"event_id": event["id"]
|
"event_id": event["id"]
|
||||||
})
|
})
|
||||||
|
|
||||||
# Embed
|
|
||||||
print("Embedding claims...")
|
print("Embedding claims...")
|
||||||
claim_embeddings = embed_texts(model, claim_texts, desc="Claims")
|
claim_embeddings = embed_texts(model, claim_texts, desc="Claims")
|
||||||
|
|
||||||
print("Embedding events...")
|
print("Embedding events...")
|
||||||
event_embeddings = embed_texts(model, event_texts, desc="Events")
|
event_embeddings = embed_texts(model, event_texts, desc="Events")
|
||||||
|
|
||||||
# Cluster
|
|
||||||
print("Clustering claims...")
|
print("Clustering claims...")
|
||||||
claim_labels = cluster_embeddings(claim_embeddings, SIMILARITY_THRESHOLD)
|
claim_labels = cluster_embeddings(claim_embeddings, SIMILARITY_THRESHOLD)
|
||||||
|
|
||||||
print("Clustering events...")
|
print("Clustering events...")
|
||||||
event_labels = cluster_embeddings(event_embeddings, SIMILARITY_THRESHOLD)
|
event_labels = cluster_embeddings(event_embeddings, SIMILARITY_THRESHOLD)
|
||||||
|
|
||||||
# Build cluster structures
|
# Assign GUIDs to clusters
|
||||||
claim_clusters: Dict[int, List[str]] = {}
|
claim_cluster_map = {}
|
||||||
|
for label in set(claim_labels):
|
||||||
|
claim_cluster_map[int(label)] = generate_guid()
|
||||||
|
|
||||||
|
event_cluster_map = {}
|
||||||
|
for label in set(event_labels):
|
||||||
|
event_cluster_map[int(label)] = generate_guid()
|
||||||
|
|
||||||
|
# Build cluster membership
|
||||||
|
claim_clusters = {}
|
||||||
for cid, label in zip(claim_ids, claim_labels):
|
for cid, label in zip(claim_ids, claim_labels):
|
||||||
claim_clusters.setdefault(int(label), []).append(cid)
|
cluster_guid = claim_cluster_map[int(label)]
|
||||||
|
claim_clusters.setdefault(cluster_guid, []).append(cid)
|
||||||
|
|
||||||
event_clusters: Dict[int, List[str]] = {}
|
event_clusters = {}
|
||||||
for eid, label in zip(event_ids, event_labels):
|
for eid, label in zip(event_ids, event_labels):
|
||||||
event_clusters.setdefault(int(label), []).append(eid)
|
cluster_guid = event_cluster_map[int(label)]
|
||||||
|
event_clusters.setdefault(cluster_guid, []).append(eid)
|
||||||
|
|
||||||
# Build cluster-level links
|
# Build ONLY cluster-level links
|
||||||
cluster_links = []
|
cluster_links = set()
|
||||||
for link in links:
|
|
||||||
claim_cluster = int(claim_labels[claim_ids.index(link["claim_id"])])
|
|
||||||
event_cluster = int(event_labels[event_ids.index(link["event_id"])])
|
|
||||||
|
|
||||||
cluster_links.append({
|
for link in raw_links:
|
||||||
"claim_cluster": claim_cluster,
|
claim_label = int(claim_labels[claim_ids.index(link["claim_id"])])
|
||||||
"event_cluster": event_cluster
|
event_label = int(event_labels[event_ids.index(link["event_id"])])
|
||||||
})
|
|
||||||
|
claim_cluster_guid = claim_cluster_map[claim_label]
|
||||||
|
event_cluster_guid = event_cluster_map[event_label]
|
||||||
|
|
||||||
|
cluster_links.add((claim_cluster_guid, event_cluster_guid))
|
||||||
|
|
||||||
|
cluster_links = [
|
||||||
|
{"claim_cluster_id": c, "event_cluster_id": e}
|
||||||
|
for c, e in cluster_links
|
||||||
|
]
|
||||||
|
|
||||||
# Output structure
|
|
||||||
output = {
|
output = {
|
||||||
"claims": [
|
"claims": [
|
||||||
{"id": cid, "text": txt.replace("Claim: ", "")}
|
{"id": cid, "text": txt.replace("Claim: ", "")}
|
||||||
@@ -157,14 +156,13 @@ def main():
|
|||||||
for eid, txt in zip(event_ids, event_texts)
|
for eid, txt in zip(event_ids, event_texts)
|
||||||
],
|
],
|
||||||
"claim_clusters": [
|
"claim_clusters": [
|
||||||
{"cluster_id": int(k), "members": v}
|
{"cluster_id": k, "members": v}
|
||||||
for k, v in claim_clusters.items()
|
for k, v in claim_clusters.items()
|
||||||
],
|
],
|
||||||
"event_clusters": [
|
"event_clusters": [
|
||||||
{"cluster_id": int(k), "members": v}
|
{"cluster_id": k, "members": v}
|
||||||
for k, v in event_clusters.items()
|
for k, v in event_clusters.items()
|
||||||
],
|
],
|
||||||
"links": links,
|
|
||||||
"cluster_links": cluster_links
|
"cluster_links": cluster_links
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,4 @@
|
|||||||
|
.parcel-cache/
|
||||||
|
dist/
|
||||||
|
node_modules/
|
||||||
|
src/data.json
|
||||||
+3576
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,20 @@
|
|||||||
|
{
|
||||||
|
"name": "parcel-react-client-starter",
|
||||||
|
"private": true,
|
||||||
|
"version": "0.0.0",
|
||||||
|
"source": "src/index.html",
|
||||||
|
"scripts": {
|
||||||
|
"start": "parcel",
|
||||||
|
"build": "parcel build"
|
||||||
|
},
|
||||||
|
"dependencies": {
|
||||||
|
"react": "^19.2.5",
|
||||||
|
"react-dom": "^19.2.5",
|
||||||
|
"react-force-graph-2d": "^1.29.1"
|
||||||
|
},
|
||||||
|
"devDependencies": {
|
||||||
|
"@types/react": "^19.0.0",
|
||||||
|
"@types/react-dom": "^19.0.0",
|
||||||
|
"parcel": "^2.14.0"
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,8 @@
|
|||||||
|
html {
|
||||||
|
color-scheme: light dark;
|
||||||
|
font-family: system-ui;
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
justify-content: center;
|
||||||
|
height: 100%;
|
||||||
|
}
|
||||||
@@ -0,0 +1,104 @@
|
|||||||
|
import React, { useEffect, useMemo, useState } from "react";
|
||||||
|
import ForceGraph2D from "react-force-graph-2d";
|
||||||
|
|
||||||
|
import data from "./data.json";
|
||||||
|
|
||||||
|
/**
 * Convert the clustered dataset into the `{ nodes, links }` object that is
 * passed to ForceGraph2D as `graphData`.
 *
 * - Every entry of `data.claim_clusters` / `data.event_clusters` becomes one
 *   node: `id` is the cluster's `cluster_id`, `label` is its `title` (or a
 *   placeholder when the title is missing/empty), `type` is
 *   "claim_cluster" / "event_cluster", and `members` is the list of member ids.
 * - Every entry of `data.cluster_links` becomes a link from
 *   `claim_cluster_id` (source) to `event_cluster_id` (target).
 *
 * Missing arrays are treated as empty, and links whose endpoints do not match
 * any emitted node are skipped.
 *
 * @param {object} data parsed contents of data.json
 * @returns {{nodes: object[], links: object[]}}
 */
function buildGraph(data) {
  const nodes = [];
  const knownIds = new Set();

  // Shared by both cluster kinds; only the type tag and placeholder differ.
  const addClusterNodes = (clusters, type, fallbackLabel) => {
    (clusters ?? []).forEach((cluster) => {
      nodes.push({
        id: cluster.cluster_id,
        label: cluster.title || fallbackLabel,
        type,
        // Consumers read `members.length`, so never leave this undefined.
        members: cluster.members ?? []
      });
      knownIds.add(cluster.cluster_id);
    });
  };

  addClusterNodes(data.claim_clusters, "claim_cluster", "Unnamed Claim Cluster");
  addClusterNodes(data.event_clusters, "event_cluster", "Unnamed Event Cluster");

  // Drop dangling links up front: the force layout rejects links that
  // reference a node id it does not know about.
  const links = (data.cluster_links ?? [])
    .filter(
      (link) =>
        knownIds.has(link.claim_cluster_id) && knownIds.has(link.event_cluster_id)
    )
    .map((link) => ({
      source: link.claim_cluster_id,
      target: link.event_cluster_id
    }));

  return { nodes, links };
}
|
||||||
|
|
||||||
|
export function App() {
|
||||||
|
const [selectedNode, setSelectedNode] = useState(null);
|
||||||
|
|
||||||
|
const graphData = useMemo(() => buildGraph(data), []);
|
||||||
|
function setNode(node) {
|
||||||
|
console.log(node)
|
||||||
|
setSelectedNode(node)
|
||||||
|
}
|
||||||
|
return (
|
||||||
|
<div>
|
||||||
|
<div>
|
||||||
|
<ForceGraph2D
|
||||||
|
graphData={graphData}
|
||||||
|
nodeLabel={(node) => node.label}
|
||||||
|
nodeAutoColorBy="type"
|
||||||
|
//linkDirectionalParticles={1}
|
||||||
|
//linkDirectionalParticleSpeed={0.002}
|
||||||
|
onNodeRightClick={(node) => setNode(node)}
|
||||||
|
nodeCanvasObject={(node, ctx, globalScale) => {
|
||||||
|
const fontSize = 12;
|
||||||
|
ctx.font = `${fontSize}px Sans-Serif`;
|
||||||
|
ctx.fillStyle = node.type.includes('claim') ? "blue" : "green";
|
||||||
|
ctx.beginPath();
|
||||||
|
ctx.arc(node.x, node.y, 2*node.members.length , 0, 2 * Math.PI, false);
|
||||||
|
ctx.fill();
|
||||||
|
|
||||||
|
if (node.members.length > 2) {
|
||||||
|
ctx.fillStyle = "black";
|
||||||
|
ctx.fillText(node.label, node.x + 12, node.y + 4);
|
||||||
|
}
|
||||||
|
|
||||||
|
}}
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div style={{ position:"absolute", top:0, left:0 }}>
|
||||||
|
<h2>Details</h2>
|
||||||
|
{selectedNode ? (
|
||||||
|
<div>
|
||||||
|
<p><strong>ID:</strong> {selectedNode.id}</p>
|
||||||
|
<p><strong>Type:</strong> {selectedNode.type}</p>
|
||||||
|
<p><strong>Title / Label:</strong> {selectedNode.label}</p>
|
||||||
|
{selectedNode.members && (
|
||||||
|
<div>
|
||||||
|
<p><strong>Members:</strong></p>
|
||||||
|
<ul>
|
||||||
|
{selectedNode.members.map((m) => {
|
||||||
|
const memberData = data.claims.find(c => c.id === m) || data.events.find(e => e.id === m);
|
||||||
|
return <li key={m}>{memberData ? memberData.text : m}</li>;
|
||||||
|
})}
|
||||||
|
</ul>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
) : (
|
||||||
|
<p>Click a cluster node to see its members</p>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
);
|
||||||
|
}
|
||||||
@@ -0,0 +1,12 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8"/>
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||||
|
<title>Parcel React App</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<div id="app"></div>
|
||||||
|
<script type="module" src="index.tsx"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
@@ -0,0 +1,11 @@
|
|||||||
|
import { createRoot } from 'react-dom/client';
|
||||||
|
import { StrictMode } from 'react';
|
||||||
|
import { App } from './App';
|
||||||
|
|
||||||
|
// Mount point declared in index.html (<div id="app">); the non-null assertion
// reflects that the element is expected to exist when this module runs.
const container = document.getElementById("app")!;

// Create the React root and render the application inside StrictMode.
createRoot(container).render(
  <StrictMode>
    <App />
  </StrictMode>
);
|
||||||
@@ -0,0 +1,29 @@
|
|||||||
|
{
|
||||||
|
"compilerOptions": {
|
||||||
|
/* Visit https://aka.ms/tsconfig to read more about this file */
|
||||||
|
"target": "ES2020",
|
||||||
|
"lib": ["ES2020", "DOM", "DOM.Iterable"],
|
||||||
|
"jsx": "react-jsx",
|
||||||
|
"useDefineForClassFields": true,
|
||||||
|
|
||||||
|
/* Modules */
|
||||||
|
"module": "ESNext",
|
||||||
|
"moduleResolution": "bundler",
|
||||||
|
|
||||||
|
/* Emit */
|
||||||
|
"noEmit": true,
|
||||||
|
|
||||||
|
/* Interop Constraints */
|
||||||
|
"isolatedModules": true,
|
||||||
|
"allowSyntheticDefaultImports": true,
|
||||||
|
"allowImportingTsExtensions": true,
|
||||||
|
"esModuleInterop": true,
|
||||||
|
"forceConsistentCasingInFileNames": true,
|
||||||
|
|
||||||
|
/* Type Checking */
|
||||||
|
"strict": true,
|
||||||
|
|
||||||
|
/* Completeness */
|
||||||
|
"skipLibCheck": true
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user