/**
 * Groups the nodes of an undirected graph into connected components.
 *
 * @param {Array<{id: *}>} nodes - Graph nodes; each must expose a unique `id`.
 * @param {Array<{source: *, target: *}>} links - Edges. Each endpoint is
 *   either a node id or a node object carrying an `id` property.
 * @returns {Array<Array<*>>} One array of node ids per component, ordered by
 *   the first appearance of a member in `nodes`. Every id in `nodes` appears
 *   in exactly one component; ids that only occur in `links` never appear.
 */
function getConnectedComponents(nodes, links) {
  // d3-force replaces `link.source` / `link.target` ids with references to the
  // node objects once a simulation has initialised the links, so accept both
  // shapes instead of silently dropping the already-resolved ones.
  const toId = (endpoint) =>
    endpoint !== null && typeof endpoint === 'object' ? endpoint.id : endpoint;

  const adjacency = new Map();
  for (const node of nodes) {
    adjacency.set(node.id, new Set());
  }

  for (const link of links) {
    const sourceId = toId(link.source);
    const targetId = toId(link.target);
    const sourceNeighbours = adjacency.get(sourceId);
    const targetNeighbours = adjacency.get(targetId);

    // A link pointing at an id that is not in `nodes` must not contribute an
    // edge: otherwise the unknown id is reached during traversal and counted
    // as a member, inflating the component size seen by callers.
    if (sourceNeighbours === undefined || targetNeighbours === undefined) {
      continue;
    }

    sourceNeighbours.add(targetId);
    targetNeighbours.add(sourceId);
  }

  const visited = new Set();
  const components = [];

  for (const node of nodes) {
    if (visited.has(node.id)) continue;

    // Iterative depth-first traversal; an explicit stack avoids recursion
    // depth limits on large components.
    const stack = [node.id];
    const component = [];

    while (stack.length > 0) {
      const id = stack.pop();
      // The same id can be pushed by several neighbours before it is popped.
      if (visited.has(id)) continue;

      visited.add(id);
      component.push(id);

      for (const neighbour of adjacency.get(id)) {
        if (!visited.has(neighbour)) stack.push(neighbour);
      }
    }

    components.push(component);
  }

  return components;
}
ID: {selectedNode.id}
-Type: {selectedNode.type}
Title: {selectedNode.label}
{selectedNode.members && ( diff --git a/graphviz/processing/create_clusters.py b/graphviz/processing/create_clusters.py index 5d4d098..a885501 100644 --- a/graphviz/processing/create_clusters.py +++ b/graphviz/processing/create_clusters.py @@ -10,10 +10,10 @@ from sklearn.metrics.pairwise import cosine_similarity from tqdm import tqdm -INPUT_CSV = "../../data/dataset-dev.csv" +INPUT_CSV = "../../data/dataset.csv" OUTPUT_JSON = "../../data/clustered_output.json" MODEL_NAME = "all-MiniLM-L6-v2" -SIMILARITY_THRESHOLD = 0.65 +SIMILARITY_THRESHOLD = 0.8 def generate_guid(): return str(uuid.uuid4()) diff --git a/graphviz/processing/process_clusters.py b/graphviz/processing/process_clusters.py new file mode 100644 index 0000000..fc7051f --- /dev/null +++ b/graphviz/processing/process_clusters.py @@ -0,0 +1,119 @@ +import json +from collections import defaultdict, deque + +# ------------------------------- +# CONFIG +# ------------------------------- +INPUT_FILE = "../../data/clustered_output.json" # Your original JSON +OUTPUT_FILE = "../../data/clustered_output2.json" # Output JSON file + +# ------------------------------- +# Load data +# ------------------------------- +with open(INPUT_FILE, "r") as f: + data = json.load(f) + +# ------------------------------- +# Prepare cluster sets +# ------------------------------- +claim_clusters = {c["cluster_id"] for c in data["claim_clusters"]} +event_clusters = {e["cluster_id"] for e in data["event_clusters"]} +all_clusters = claim_clusters.union(event_clusters) + +# ------------------------------- +# Build graph from cluster links +# ------------------------------- +graph = defaultdict(set) +for link in data.get("cluster_links", []): + c_id = link["claim_cluster_id"] + e_id = link["event_cluster_id"] + graph[c_id].add(e_id) + graph[e_id].add(c_id) + +# Make sure all clusters appear in graph (even isolated ones) +for cid in all_clusters: + graph[cid] = graph[cid] + +# ------------------------------- +# Find connected components 
# -------------------------------
# `graph` (built earlier in this script) maps every cluster id to the set of
# cluster ids it is linked to; isolated clusters map to an empty set.
visited = set()
components = []

for node in graph:
    if node in visited:
        continue

    # Breadth-first traversal collecting everything reachable from `node`.
    queue = deque([node])
    component = set()
    while queue:
        current = queue.popleft()
        # A cluster can be queued by several neighbours before it is processed.
        if current in visited:
            continue
        visited.add(current)
        component.add(current)
        for neighbor in graph[current]:
            if neighbor not in visited:
                queue.append(neighbor)
    components.append(component)

# Keep only mid-sized components. Both bounds are exclusive.
COMPONENT_SIZE_LOWER_BOUND = 8
COMPONENT_SIZE_UPPER_BOUND = 50
large_components = [
    c
    for c in components
    if COMPONENT_SIZE_LOWER_BOUND < len(c) < COMPONENT_SIZE_UPPER_BOUND
]

# -------------------------------
# Output stats
# -------------------------------
num_components = len(large_components)
num_nodes = sum(len(c) for c in large_components)

# The label states both bounds so the printed count matches the filter above.
print(
    f"Connected components ({COMPONENT_SIZE_LOWER_BOUND} < size < "
    f"{COMPONENT_SIZE_UPPER_BOUND}):",
    num_components,
)
print("Total clusters in those components:", num_nodes)

# -------------------------------
# Prepare lookup tables
# -------------------------------
# Item id -> text.
claim_lookup = {c["id"]: c["text"] for c in data["claims"]}
event_lookup = {e["id"]: e["text"] for e in data["events"]}

# Cluster id -> member ids of that cluster.
claim_cluster_map = {c["cluster_id"]: c["members"] for c in data["claim_clusters"]}
event_cluster_map = {e["cluster_id"]: e["members"] for e in data["event_clusters"]}


def extract_texts(component):
    """Return the texts of all members of every cluster in ``component``.

    A cluster id found in ``claim_cluster_map`` is treated as a claim cluster;
    otherwise it is looked up in ``event_cluster_map``. Member ids without an
    entry in the corresponding text lookup are skipped, as are cluster ids
    present in neither map.
    """
    texts = []
    for cid in component:
        if cid in claim_cluster_map:
            texts.extend(
                claim_lookup[mid] for mid in claim_cluster_map[cid] if mid in claim_lookup
            )
        elif cid in event_cluster_map:
            texts.extend(
                event_lookup[mid] for mid in event_cluster_map[cid] if mid in event_lookup
            )
    return texts


# -------------------------------
# Optional: Generate titles
# -------------------------------
user_input = input("Generate titles for each component? (y/n): ")

# Tolerate stray whitespace around the answer (e.g. "y " or " Y").
if user_input.strip().lower() == "y":
    output = []

    for i, comp in enumerate(large_components):
        texts = extract_texts(comp)

        # Show a few sample texts
        print(f"\nComponent {i} sample texts:")
        for t in texts[:5]:
            print("-", t)

        # Ask user for a 3-5 word title (could be automated with OpenAI API)
        title = input("Enter 3-5 word title: ").strip()

        output.append({
            "component_id": i,
            # `comp` is a set, so sort it to make the saved file reproducible
            # between runs; key=str keeps this safe whatever the id type is.
            "cluster_ids": sorted(comp, key=str),
            "title": title
        })

    # Save JSON
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2)

    print(f"Saved cluster titles to {OUTPUT_FILE}")
else:
    print("No titles generated. Script finished.")