diff --git a/graphviz/frontend/.gitignore b/graphviz/frontend/.gitignore index a3a8c2a..a5e11f8 100644 --- a/graphviz/frontend/.gitignore +++ b/graphviz/frontend/.gitignore @@ -1,4 +1,5 @@ .parcel-cache/ dist/ node_modules/ -src/data.json \ No newline at end of file +src/data.json +src/titles.json \ No newline at end of file diff --git a/graphviz/frontend/src/App.tsx b/graphviz/frontend/src/App.tsx index be6d512..4022174 100644 --- a/graphviz/frontend/src/App.tsx +++ b/graphviz/frontend/src/App.tsx @@ -3,6 +3,7 @@ import ForceGraph2D from "react-force-graph-2d"; import * as d3 from "d3-force-3d"; import data from "./data.json"; +import titlesData from "./titles.json"; function drawRoundedRect(ctx, x, y, width, height, radius) { const r = Math.min(radius, width / 2, height / 2); @@ -24,10 +25,13 @@ function buildGraph(data) { const nodes = []; const links = []; + // Create a lookup map for quick access + const titleMap = new Map(titlesData.map(t => [t.cluster_id, t.title])); + data.claim_clusters.forEach((cluster) => { nodes.push({ id: cluster.cluster_id, - label: cluster.title || "Unnamed Claim Cluster", + label: titleMap.get(cluster.cluster_id) || cluster.title || "Unnamed Claim Cluster", type: "claim_cluster", members: cluster.members }); @@ -36,7 +40,7 @@ function buildGraph(data) { data.event_clusters.forEach((cluster) => { nodes.push({ id: cluster.cluster_id, - label: cluster.title || "Unnamed Event Cluster", + label: titleMap.get(cluster.cluster_id) || cluster.title || "Unnamed Event Cluster", type: "event_cluster", members: cluster.members }); @@ -124,13 +128,15 @@ export function App() { // Stronger repulsion fgRef.current.d3Force( "charge", - d3.forceManyBody().strength(-3000) + d3.forceManyBody().strength(-10000) ); + + // Link distance fgRef.current.d3Force( "link", - d3.forceLink().distance(240) + d3.forceLink().distance(140) ); // Collision based on dynamic box size @@ -138,7 +144,7 @@ export function App() { "collision", d3.forceCollide((node) => { const dims = node.__bckgDimensions; - return dims ? Math.max(dims[0], dims[1]) / 2 + 16 : 20; + return dims ? Math.max(dims[0], dims[1]) / 2 + 32 : 40; }) ); @@ -224,7 +230,7 @@ export function App() {
setMinGraphSize(Number(e.target.value))} diff --git a/graphviz/processing/process_clusters.py b/graphviz/processing/process_clusters.py index d62d7e6..250211e 100644 --- a/graphviz/processing/process_clusters.py +++ b/graphviz/processing/process_clusters.py @@ -4,17 +4,21 @@ from openai import OpenAI from tqdm import tqdm from dotenv import load_dotenv import os +from concurrent.futures import ThreadPoolExecutor, as_completed +# ------------------------------- +# Load environment and OpenAI client +# ------------------------------- load_dotenv() # Load environment variables from .env file client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) + # ------------------------------- # CONFIG # ------------------------------- INPUT_FILE = "../../data/clustered_output.json" # Your original JSON -OUTPUT_FILE = "../../data/clustered_output2.json" # Output JSON file +OUTPUT_FILE = "../../data/clustered_output2.json" # Output JSON file OPENAI_MODEL = "gpt-5-nano" - # ------------------------------- # Load data # ------------------------------- @@ -62,8 +66,8 @@ for node in graph: queue.append(neighbor) components.append(component) -# Filter components with size > 8 -large_components = [c for c in components if len(c) > 8 and len(c) < 50] +# Filter components with size > 8 and < 50 +large_components = [c for c in components if 8 < len(c) < 50] print("Connected components (size > 8):", len(large_components)) print("Total clusters in those components:", sum(len(c) for c in large_components)) @@ -94,13 +98,14 @@ def generate_title(texts): "\n\nTitle:" ) try: - response = client.chat.completions.create(model=OPENAI_MODEL, - messages=[ - {"role": "system", "content": "You are a helpful assistant who creates short, meaningful titles."}, - {"role": "user", "content": prompt} - ]) + response = client.chat.completions.create( + model=OPENAI_MODEL, + messages=[ + {"role": "system", "content": "You are a helpful assistant who creates short, meaningful titles."}, + {"role": "user", "content": prompt} + ] + ) title = response.choices[0].message.content.strip() - if title.lower().startswith("title:"): title = title[6:].strip() return title @@ -109,19 +114,31 @@ def generate_title(texts): return "Untitled Cluster" # ------------------------------- -# Generate title per cluster with progress bar +# Wrapper for parallel execution +# ------------------------------- +def generate_title_for_cluster(cluster_id): + texts = extract_texts_for_cluster(cluster_id) + title = generate_title(texts) + return {"cluster_id": cluster_id, "title": title} + +# ------------------------------- +# Generate titles in parallel # ------------------------------- clusters_in_large_components = [cid for comp in large_components for cid in comp] output = [] -print("\nGenerating GPT titles for clusters...") -for cluster_id in tqdm(clusters_in_large_components, desc="Clusters", ncols=100): - texts = extract_texts_for_cluster(cluster_id) - title = generate_title(texts) - output.append({ - "cluster_id": cluster_id, - "title": title - }) +print("\nGenerating GPT titles for clusters (parallel)...") + +with ThreadPoolExecutor(max_workers=10) as executor: + future_to_cluster = {executor.submit(generate_title_for_cluster, cid): cid for cid in clusters_in_large_components} + for future in tqdm(as_completed(future_to_cluster), total=len(clusters_in_large_components), desc="Clusters", ncols=100): + try: + result = future.result() + output.append(result) + except Exception as e: + cid = future_to_cluster[future] + print(f"Error processing cluster {cid}: {e}") + output.append({"cluster_id": cid, "title": "Untitled Cluster"}) # ------------------------------- # Save JSON