Add create clusters initial version

2026-04-09 14:25:43 +01:00
parent ac49351425
commit 2326e61457
3 changed files with 200 additions and 10 deletions
@@ -52,11 +52,71 @@ function buildGraph(data) {
  return { nodes, links };
 }
 /**
  * Partition a graph into its connected components, treating links as undirected.
  *
  * Link endpoints may be raw node ids or node objects carrying an `id`
  * (d3-force rewrites `source`/`target` into node objects once a simulation
  * has initialised the links). A link whose source or target is not present
  * in `nodes` is ignored, so unknown ids never show up in a component.
  *
  * @param {Array<{id: *}>} nodes - Graph nodes; only `id` is read.
  * @param {Array<{source: *, target: *}>} links - Graph edges.
  * @returns {Array<Array<*>>} One array of node ids per component, ordered by
  *   the first appearance of each component in `nodes`.
  */
 function getConnectedComponents(nodes, links) {
  // Accept both `{ source: "a" }` and `{ source: { id: "a" } }`.
  const endpointId = end =>
    end !== null && typeof end === "object" ? end.id : end;
  const adj = new Map();
  nodes.forEach(n => adj.set(n.id, new Set()));
  links.forEach(l => {
    const sourceId = endpointId(l.source);
    const targetId = endpointId(l.target);
    // Skip dangling links: registering only one side would let the traversal
    // below count an id that is not a real node, inflating the component size.
    if (!adj.has(sourceId) || !adj.has(targetId)) return;
    adj.get(sourceId).add(targetId);
    adj.get(targetId).add(sourceId);
  });
  const visited = new Set();
  const components = [];
  for (const node of nodes) {
    if (visited.has(node.id)) continue;
    // Iterative depth-first search from every not-yet-visited node.
    const stack = [node.id];
    const comp = [];
    while (stack.length) {
      const id = stack.pop();
      if (visited.has(id)) continue;
      visited.add(id);
      comp.push(id);
      adj.get(id).forEach(nei => {
        if (!visited.has(nei)) stack.push(nei);
      });
    }
    components.push(comp);
  }
  return components;
 }
 export function App() {
  const fgRef = useRef();
  const [selectedNode, setSelectedNode] = useState(null);
  const [minGraphSize, setMinGraphSize] = useState(10);
-  const graphData = useMemo(() => buildGraph(data), []);
+  const graphData = useMemo(() => {
    // Derive the graph to render: build the full graph, split it into
    // connected components, and drop every node and link that falls outside
    // the accepted components. Recomputed whenever minGraphSize changes.
    const full = buildGraph(data);
    const components = getConnectedComponents(full.nodes, full.links);
    // Keep components with minGraphSize <= size < 50 and collect their node ids.
    const validIds = new Set(
      components
        .filter(comp => comp.length >= minGraphSize && comp.length < 50)
        .flat()
    );
    const filteredNodes = full.nodes.filter(n => validIds.has(n.id));
    // A link survives only when both endpoints were kept. l.source / l.target
    // are compared directly against node ids, so this assumes they are still
    // raw ids here - TODO confirm buildGraph returns fresh link objects.
    const filteredLinks = full.links.filter(
      l => validIds.has(l.source) && validIds.has(l.target)
    );
    return {
      nodes: filteredNodes,
      links: filteredLinks
    };
  }, [minGraphSize]);
  useEffect(() => {
    if (!fgRef.current) return;
@@ -83,7 +143,7 @@ export function App() {
    );
    fgRef.current.d3ReheatSimulation();
-  }, []);
+  }, [graphData]);
  return (
    <div>
@@ -149,19 +209,30 @@ export function App() {
      <div
        style={{
          position: "absolute",
-          top: 0,
+          top: "10px",
-          right: 0,
+          right: "10px",
          borderRadius: "3px",
          backgroundColor: "gray",
-          padding: "10px",
+          padding: "20px",
-          maxWidth: "300px"
+          maxWidth: "500px"
        }}
      >
         <h2>Config</h2>
          <label>
            Min connected graph size: <strong>{minGraphSize}</strong>
          </label>
          <br />
          <input
            type="range"
            min="8"
            max="49"
            value={minGraphSize}
            onChange={(e) => setMinGraphSize(Number(e.target.value))}
          />
        <h2>Details</h2>
        {selectedNode ? (
          <div>
            <p><strong>ID:</strong> {selectedNode.id}</p>
            <p><strong>Type:</strong> {selectedNode.type}</p>
            <p><strong>Title:</strong> {selectedNode.label}</p>
            {selectedNode.members && (
@@ -10,10 +10,10 @@ from sklearn.metrics.pairwise import cosine_similarity
 from tqdm import tqdm
-INPUT_CSV = "../../data/dataset-dev.csv"
+INPUT_CSV = "../../data/dataset.csv"
 OUTPUT_JSON = "../../data/clustered_output.json"
 MODEL_NAME = "all-MiniLM-L6-v2"
-SIMILARITY_THRESHOLD = 0.65
+SIMILARITY_THRESHOLD = 0.8
 def generate_guid():
    """Return a freshly generated random (version 4) UUID as a string."""
    guid = uuid.uuid4()
    return str(guid)
@@ -0,0 +1,119 @@
 import json
 from collections import defaultdict, deque
 # -------------------------------
 # CONFIG
 # -------------------------------
 # Relative paths: they resolve against the current working directory.
 INPUT_FILE = "../../data/clustered_output.json"    # Clustering result to analyse
 OUTPUT_FILE = "../../data/clustered_output2.json"  # Where component titles are written
 # -------------------------------
 # Load data
 # -------------------------------
 # Read as UTF-8 explicitly: without it open() falls back to the locale
 # encoding, which can mis-decode non-ASCII text in the JSON on some systems.
 with open(INPUT_FILE, "r", encoding="utf-8") as f:
    data = json.load(f)
 # -------------------------------
 # Prepare cluster sets
 # -------------------------------
 # Ids of every claim cluster and every event cluster, plus their union.
 claim_clusters = {entry["cluster_id"] for entry in data["claim_clusters"]}
 event_clusters = {entry["cluster_id"] for entry in data["event_clusters"]}
 all_clusters = claim_clusters.union(event_clusters)
 # -------------------------------
 # Build graph from cluster links
 # -------------------------------
 # Undirected adjacency: each link connects one claim cluster to one event cluster.
 graph = defaultdict(set)
 for edge in data.get("cluster_links", []):
    claim_id = edge["claim_cluster_id"]
    event_id = edge["event_cluster_id"]
    graph[claim_id].add(event_id)
    graph[event_id].add(claim_id)
 # Register clusters that have no links so they still form singleton components.
 for cluster_id in all_clusters:
    graph.setdefault(cluster_id, set())
 # -------------------------------
 # Find connected components
 # -------------------------------
 # Breadth-first search from every cluster that has not been reached yet.
 visited = set()
 components = []
 for start in graph:
    if start in visited:
        continue
    members = set()
    frontier = deque([start])
    while frontier:
        cluster = frontier.popleft()
        if cluster in visited:
            continue
        visited.add(cluster)
        members.add(cluster)
        frontier.extend(nxt for nxt in graph[cluster] if nxt not in visited)
    components.append(members)
 # Keep only mid-sized components: more than MIN_COMPONENT_SIZE and fewer than
 # MAX_COMPONENT_SIZE clusters (both bounds are exclusive).
 MIN_COMPONENT_SIZE = 8
 MAX_COMPONENT_SIZE = 50
 large_components = [
    c for c in components if MIN_COMPONENT_SIZE < len(c) < MAX_COMPONENT_SIZE
 ]
 # -------------------------------
 # Output stats
 # -------------------------------
 num_components = len(large_components)
 num_nodes = sum(len(c) for c in large_components)
 # Report the real filter range; the upper bound used to be missing from the message.
 print(f"Connected components ({MIN_COMPONENT_SIZE} < size < {MAX_COMPONENT_SIZE}):", num_components)
 print("Total clusters in those components:", num_nodes)
 # -------------------------------
 # Prepare lookup tables
 # -------------------------------
 # Item id -> text, for individual claims and events.
 claim_lookup = dict((claim["id"], claim["text"]) for claim in data["claims"])
 event_lookup = dict((event["id"], event["text"]) for event in data["events"])
 # Cluster id -> list of member ids, for claim clusters and event clusters.
 claim_cluster_map = dict(
    (cluster["cluster_id"], cluster["members"]) for cluster in data["claim_clusters"]
 )
 event_cluster_map = dict(
    (cluster["cluster_id"], cluster["members"]) for cluster in data["event_clusters"]
 )
 def extract_texts(component):
    """Collect the texts of all members of the clusters in ``component``.

    Each cluster id is looked up among the claim clusters first and the event
    clusters second; ids found in neither are skipped, as are member ids that
    have no entry in the matching text lookup.

    Args:
        component: Iterable of cluster ids.

    Returns:
        A list of text values, in iteration order of ``component``.
    """
    collected = []
    for cluster_id in component:
        if cluster_id in claim_cluster_map:
            member_ids, text_by_id = claim_cluster_map[cluster_id], claim_lookup
        elif cluster_id in event_cluster_map:
            member_ids, text_by_id = event_cluster_map[cluster_id], event_lookup
        else:
            continue
        for member_id in member_ids:
            if member_id in text_by_id:
                collected.append(text_by_id[member_id])
    return collected
 # -------------------------------
 # Optional: Generate titles
 # -------------------------------
 user_input = input("Generate titles for each component? (y/n): ")
 # Strip surrounding whitespace so an answer such as "y " is not read as "no".
 if user_input.strip().lower() == "y":
    output = []
    for i, comp in enumerate(large_components):
        texts = extract_texts(comp)
        # Show a few sample texts so the operator can choose a fitting title
        print(f"\nComponent {i} sample texts:")
        for t in texts[:5]:
            print("-", t)
        # Ask user for a 3-5 word title (could be automated with OpenAI API)
        title = input("Enter 3-5 word title: ").strip()
        output.append({
            "component_id": i,
            # comp is a set, so the order of cluster_ids is not guaranteed
            "cluster_ids": list(comp),
            "title": title
        })
    # Save JSON (explicit encoding keeps the file independent of the locale)
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2)
    print(f"Saved cluster titles to {OUTPUT_FILE}")
 else:
    print("No titles generated. Script finished.")