LLMsForDisinformationPredic…/graphviz/processing/process_clusters.py

import json
from collections import defaultdict, deque
import openai
from tqdm import tqdm
from dotenv import load_dotenv
import os

# -------------------------------
# CONFIG
# -------------------------------
INPUT_FILE = "../../data/clustered_output.json"    # JSON read by this script
OUTPUT_FILE = "../../data/clustered_output2.json"  # JSON written by this script
OPENAI_MODEL = "gpt-5-nano"                        # model name sent with every title request

# Read variables from a local .env file, then hand the API key (None when the
# variable is unset) to the openai module.
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

# -------------------------------
# Load data
# -------------------------------
# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform's locale encoding, which can fail or mis-decode non-ASCII text.
with open(INPUT_FILE, "r", encoding="utf-8") as f:
    data = json.load(f)

# -------------------------------
# Prepare cluster sets
# -------------------------------
# Ids of every claim cluster and every event cluster present in the input.
claim_clusters = {
    claim_cluster["cluster_id"] for claim_cluster in data["claim_clusters"]
}
event_clusters = {
    event_cluster["cluster_id"] for event_cluster in data["event_clusters"]
}

# Both kinds of cluster become nodes of the same graph.
all_clusters = claim_clusters | event_clusters

# -------------------------------
# Build graph
# -------------------------------
# Undirected adjacency map: cluster id -> set of linked cluster ids.
graph = defaultdict(set)
for edge in data.get("cluster_links", []):
    claim_node, event_node = edge["claim_cluster_id"], edge["event_cluster_id"]
    graph[claim_node].add(event_node)
    graph[event_node].add(claim_node)

# Clusters without any link still need a (neighbour-less) entry so that they
# show up as nodes when the graph is traversed.
for unlinked in all_clusters:
    graph.setdefault(unlinked, set())

# -------------------------------
# Find connected components
# -------------------------------
visited = set()
components = []

# Breadth-first search from every node that no earlier search has reached.
for start in graph:
    if start in visited:
        continue

    members = set()
    frontier = deque((start,))
    while frontier:
        cluster = frontier.popleft()
        if cluster in visited:
            # The same node can be queued several times before it is processed.
            continue
        visited.add(cluster)
        members.add(cluster)
        frontier.extend(adj for adj in graph[cluster] if adj not in visited)
    components.append(members)

# Keep only mid-sized components. Both bounds are exclusive: a component is
# kept when it has more than MIN_COMPONENT_SIZE and fewer than
# MAX_COMPONENT_SIZE clusters.
MIN_COMPONENT_SIZE = 8
MAX_COMPONENT_SIZE = 50
large_components = [
    c for c in components if MIN_COMPONENT_SIZE < len(c) < MAX_COMPONENT_SIZE
]

# Report the bounds that are actually applied (the upper bound was previously
# missing from this message).
print(
    f"Connected components ({MIN_COMPONENT_SIZE} < size < {MAX_COMPONENT_SIZE}):",
    len(large_components),
)
print("Total clusters in those components:", sum(len(c) for c in large_components))

# -------------------------------
# Prepare lookups
# -------------------------------
# Item id -> text, for individual claims and events.
claim_lookup = {
    claim["id"]: claim["text"]
    for claim in data["claims"]
}
event_lookup = {
    event["id"]: event["text"]
    for event in data["events"]
}

# Cluster id -> that cluster's "members" entry.
claim_cluster_map = {
    cluster["cluster_id"]: cluster["members"]
    for cluster in data["claim_clusters"]
}
event_cluster_map = {
    cluster["cluster_id"]: cluster["members"]
    for cluster in data["event_clusters"]
}

def extract_texts_for_cluster(cluster_id):
    """Return the texts of the members of the cluster *cluster_id*.

    The claim clusters are consulted first; the event clusters are only used
    when the id is not a claim cluster. Members without an entry in the
    matching text lookup are skipped. An unknown id yields an empty list.
    """
    if cluster_id in claim_cluster_map:
        member_ids, text_by_id = claim_cluster_map[cluster_id], claim_lookup
    elif cluster_id in event_cluster_map:
        member_ids, text_by_id = event_cluster_map[cluster_id], event_lookup
    else:
        return []

    return [text_by_id[member] for member in member_ids if member in text_by_id]

# -------------------------------
# GPT-based title generation
# -------------------------------
def generate_title(texts):
    """Ask the chat model for a 3 - 5 word title summarizing *texts*.

    Args:
        texts: The texts of one cluster; each becomes a bullet in the prompt.

    Returns:
        The model's reply with surrounding whitespace removed, or
        ``"Untitled Cluster"`` when there is nothing to summarize, the reply
        is empty, or the request fails for any reason.
    """
    fallback = "Untitled Cluster"

    # Nothing to summarize: skip the API round trip instead of sending a
    # prompt without content and storing whatever the model makes up.
    if not texts:
        return fallback

    prompt = (
        "Summarize the following texts into a concise 3 - 5 word title that captures the main theme:\n\n"
        + "\n".join(f"- {t}" for t in texts) +
        "\n\nTitle:"
    )
    try:
        # NOTE(review): `openai.ChatCompletion` is the pre-1.0 SDK entry point;
        # confirm the installed openai version (and OPENAI_MODEL) accept this
        # call. Any failure is swallowed below, so a mismatch would silently
        # give every cluster the fallback title.
        response = openai.ChatCompletion.create(
            model=OPENAI_MODEL,
            messages=[
                {"role": "system", "content": "You are a helpful assistant who creates short, meaningful titles."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.7,
            max_tokens=20
        )
        title = (response.choices[0].message["content"] or "").strip()
    except Exception as e:
        # Best effort by design: one failed request must not abort the run.
        print("Error generating title:", e)
        return fallback

    # An empty reply is as useless as a failed request; use the same fallback.
    return title or fallback

# -------------------------------
# Generate title per cluster with progress bar
# -------------------------------
# Flatten the kept components into one list of cluster ids.
clusters_in_large_components = []
for comp in large_components:
    clusters_in_large_components.extend(comp)
output = []

print("\nGenerating GPT titles for clusters...")
# One title request per cluster; tqdm shows the progress.
for cluster_id in tqdm(clusters_in_large_components, desc="Clusters", ncols=100):
    output.append({
        "cluster_id": cluster_id,
        "title": generate_title(extract_texts_for_cluster(cluster_id)),
    })

# -------------------------------
# Save JSON
# -------------------------------
# Write the list of {cluster_id, title} records as indented JSON.
with open(OUTPUT_FILE, "w") as out_file:
    out_file.write(json.dumps(output, indent=2))

print(f"\nSaved cluster titles to {OUTPUT_FILE}")