Add date ranges to frontend visualisation

2026-04-24 16:40:10 +01:00
parent f5f8800173
commit ea220e023c
6 changed files with 526 additions and 24 deletions
@@ -0,0 +1,150 @@
+import json
+from collections import defaultdict, deque
+from openai import OpenAI
+from tqdm import tqdm 
+from dotenv import load_dotenv
+import os
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+# -------------------------------
+# Load environment and OpenAI client
+# -------------------------------
+load_dotenv()  # Load environment variables from .env file
+client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+
+# -------------------------------
+# CONFIG
+# -------------------------------
+INPUT_FILE = "../../data/clustered_output.json"        # Your original JSON
+OUTPUT_FILE = "../../data/clustered_output_time.json"     # Output JSON file
+OPENAI_MODEL = "gpt-5-nano"
+
+# -------------------------------
+# Load data
+# -------------------------------
+with open(INPUT_FILE, "r") as f:
+    data = json.load(f)
+
+# -------------------------------
+# Prepare cluster sets
+# -------------------------------
+claim_clusters = {c["cluster_id"] for c in data["claim_clusters"]}
+event_clusters = {e["cluster_id"] for e in data["event_clusters"]}
+all_clusters = claim_clusters.union(event_clusters)
+
+# -------------------------------
+# Build graph
+# -------------------------------
+graph = defaultdict(set)
+for link in data.get("cluster_links", []):
+    c_id = link["claim_cluster_id"]
+    e_id = link["event_cluster_id"]
+    graph[c_id].add(e_id)
+    graph[e_id].add(c_id)
+
+for cid in all_clusters:
+    graph[cid] = graph[cid]
+
+# -------------------------------
+# Find connected components
+# -------------------------------
+visited = set()
+components = []
+
+for node in graph:
+    if node not in visited:
+        queue = deque([node])
+        component = set()
+        while queue:
+            current = queue.popleft()
+            if current in visited:
+                continue
+            visited.add(current)
+            component.add(current)
+            for neighbor in graph[current]:
+                if neighbor not in visited:
+                    queue.append(neighbor)
+        components.append(component)
+
+# Filter components with size > 8 and < 50
+large_components = [c for c in components if len(c) > 1000]
+
+print("Connected components (size > 8):", len(large_components))
+print("Total clusters in those components:", sum(len(c) for c in large_components))
+
+# -------------------------------
+# Prepare lookups
+# -------------------------------
+claim_lookup = {c["id"]: c["text"] for c in data["claims"]}
+event_lookup = {e["id"]: e["text"] for e in data["events"]}
+claim_cluster_map = {c["cluster_id"]: c["members"] for c in data["claim_clusters"]}
+event_cluster_map = {e["cluster_id"]: e["members"] for e in data["event_clusters"]}
+
+def extract_texts_for_cluster(cluster_id):
+    texts = []
+    if cluster_id in claim_cluster_map:
+        texts.extend([claim_lookup[mid] for mid in claim_cluster_map[cluster_id] if mid in claim_lookup])
+    elif cluster_id in event_cluster_map:
+        texts.extend([event_lookup[mid] for mid in event_cluster_map[cluster_id] if mid in event_lookup])
+    return texts
+
+# -------------------------------
+# GPT-based title generation
+# -------------------------------
+def generate_title(texts):
+    prompt = (
+        "Summarize the following texts into a concise 3 - 6 word title that captures the main theme:\n\n"
+        + "\n".join(f"- {t}" for t in texts) +
+        "\n\nTitle:"
+    )
+    try:
+        # response = client.chat.completions.create(
+        #     model=OPENAI_MODEL,
+        #     messages=[
+        #         {"role": "system", "content": "You are a helpful assistant who creates short, meaningful titles."},
+        #         {"role": "user", "content": prompt}
+        #     ]
+        # )
+        # title = response.choices[0].message.content.strip()
+        # if title.lower().startswith("title:"):
+        #     title = title[6:].strip()
+        # return title
+        return "UNNAMED"
+    except Exception as e:
+        print("Error generating title:", e)
+        return "Untitled Cluster"
+
+# -------------------------------
+# Wrapper for parallel execution
+# -------------------------------
+def generate_title_for_cluster(cluster_id):
+    texts = extract_texts_for_cluster(cluster_id)
+    title = generate_title(texts)
+    return {"cluster_id": cluster_id, "title": title}
+
+# -------------------------------
+# Generate titles in parallel
+# -------------------------------
+clusters_in_large_components = [cid for comp in large_components for cid in comp]
+output = []
+
+print("\nGenerating GPT titles for clusters (parallel)...")
+
+with ThreadPoolExecutor(max_workers=10) as executor:
+    future_to_cluster = {executor.submit(generate_title_for_cluster, cid): cid for cid in clusters_in_large_components}
+    for future in tqdm(as_completed(future_to_cluster), total=len(clusters_in_large_components), desc="Clusters", ncols=100):
+        try:
+            result = future.result()
+            output.append(result)
+        except Exception as e:
+            cid = future_to_cluster[future]
+            print(f"Error processing cluster {cid}: {e}")
+            output.append({"cluster_id": cid, "title": "Untitled Cluster"})
+
+# -------------------------------
+# Save JSON
+# -------------------------------
+with open(OUTPUT_FILE, "w") as f:
+    json.dump(output, f, indent=2)
+
+print(f"\nSaved cluster titles to {OUTPUT_FILE}")