Add date ranges to frontend visualisation
This commit is contained in:
@@ -1,8 +1,7 @@
|
||||
import csv
|
||||
import json
|
||||
import uuid
|
||||
from typing import List, Dict
|
||||
|
||||
import dateparser
|
||||
import numpy as np
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from sklearn.cluster import AgglomerativeClustering
|
||||
@@ -10,7 +9,7 @@ from sklearn.metrics.pairwise import cosine_similarity
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
INPUT_CSV = "../../data/dataset.csv"
|
||||
INPUT_CSV = "../../data/dataset.jsonl"
|
||||
OUTPUT_JSON = "../../data/clustered_output.json"
|
||||
MODEL_NAME = "all-MiniLM-L6-v2"
|
||||
SIMILARITY_THRESHOLD = 0.8
|
||||
@@ -19,37 +18,50 @@ def generate_guid():
|
||||
return str(uuid.uuid4())
|
||||
|
||||
|
||||
def read_csv(file_path: str):
|
||||
def read_jsonl(file_path: str):
    """Load claims and their linked events from a JSON Lines file.

    Every non-blank line is parsed as one JSON object. The keys "claim",
    "date" and "events" are read from it, and each entry of "events" is an
    object whose "Event" and "Date" keys are read. Records without claim
    text and events without event text are skipped.

    Args:
        file_path: Path to the UTF-8 encoded JSONL file.

    Returns:
        A list of dicts shaped like
        ``{"claim": {"id", "text", "date"}, "events": [{"id", "text", "date"}]}``.
        Each "id" is a fresh GUID from ``generate_guid()``. Each "date" is the
        result of ``dateparser.parse`` on the raw date text, or ``None`` when
        the date is missing or blank.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """

    def _clean(value) -> str:
        # JSON null (or a missing key) becomes "", anything else is
        # stringified so that .strip() can never fail on a non-string value.
        return "" if value is None else str(value).strip()

    def _parse_date(raw: str):
        # Only hand non-empty text to the parser; a blank date stays None.
        return dateparser.parse(raw) if raw else None

    data = []

    with open(file_path, "r", encoding="utf-8") as f:
        for line in tqdm(f, desc="Reading JSONL"):
            line = line.strip()
            if not line:
                continue

            obj = json.loads(line)
            if not isinstance(obj, dict):
                # A valid JSON value that is not an object carries no claim.
                continue

            claim_text = _clean(obj.get("claim"))
            if not claim_text:
                continue
            claim_date = _clean(obj.get("date"))

            event_objects = []
            # "events": null is treated the same as a missing key.
            for e in obj.get("events") or []:
                if not isinstance(e, dict):
                    continue
                event_text = _clean(e.get("Event"))
                if not event_text:
                    continue
                event_objects.append({
                    "id": generate_guid(),
                    "text": event_text,
                    "date": _parse_date(_clean(e.get("Date"))),
                })

            data.append({
                "claim": {
                    "id": generate_guid(),
                    "text": claim_text,
                    "date": _parse_date(claim_date),
                },
                "events": event_objects,
            })

    return data
|
||||
|
||||
|
||||
def embed_texts(model, texts: List[str], desc="Embedding"):
|
||||
embeddings = []
|
||||
@@ -76,10 +88,10 @@ def main():
|
||||
print("Loading model...")
|
||||
model = SentenceTransformer(MODEL_NAME)
|
||||
|
||||
data = read_csv(INPUT_CSV)
|
||||
data = read_jsonl(INPUT_CSV)
|
||||
|
||||
claim_texts, claim_ids = [], []
|
||||
event_texts, event_ids = [], []
|
||||
claim_texts, claim_ids, claim_dates = [], [], []
|
||||
event_texts, event_ids, event_dates = [], [], []
|
||||
|
||||
raw_links = [] # temporary for cluster mapping
|
||||
|
||||
@@ -87,10 +99,12 @@ def main():
|
||||
claim = entry["claim"]
|
||||
claim_ids.append(claim["id"])
|
||||
claim_texts.append(f"Claim: {claim['text']}")
|
||||
claim_dates.append(claim['date'])
|
||||
|
||||
for event in entry["events"]:
|
||||
event_ids.append(event["id"])
|
||||
event_texts.append(f"Event: {event['text']}")
|
||||
event_dates.append(event['date'])
|
||||
|
||||
raw_links.append({
|
||||
"claim_id": claim["id"],
|
||||
@@ -148,12 +162,12 @@ def main():
|
||||
|
||||
output = {
|
||||
"claims": [
|
||||
{"id": cid, "text": txt.replace("Claim: ", "")}
|
||||
for cid, txt in zip(claim_ids, claim_texts)
|
||||
{"id": cid, "text": txt.replace("Claim: ", ""), "date": str(dat)}
|
||||
for cid, txt, dat in zip(claim_ids, claim_texts, claim_dates)
|
||||
],
|
||||
"events": [
|
||||
{"id": eid, "text": txt.replace("Event: ", "")}
|
||||
for eid, txt in zip(event_ids, event_texts)
|
||||
{"id": eid, "text": txt.replace("Event: ", ""), "date": str(dat)}
|
||||
for eid, txt, dat in zip(event_ids, event_texts, event_dates)
|
||||
],
|
||||
"claim_clusters": [
|
||||
{"cluster_id": k, "members": v}
|
||||
|
||||
@@ -0,0 +1,150 @@
|
||||
import json
|
||||
from collections import defaultdict, deque
|
||||
from openai import OpenAI
|
||||
from tqdm import tqdm
|
||||
from dotenv import load_dotenv
|
||||
import os
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
|
||||
# -------------------------------
# Load environment and OpenAI client
# -------------------------------
load_dotenv()  # Load environment variables from .env file
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# -------------------------------
# CONFIG
# -------------------------------
INPUT_FILE = "../../data/clustered_output.json"  # Clustering result read by this script
OUTPUT_FILE = "../../data/clustered_output_time.json"  # Where the cluster titles are written
OPENAI_MODEL = "gpt-5-nano"

# -------------------------------
# Load data
# -------------------------------
# Decode explicitly as UTF-8 so the read does not depend on the platform's
# default locale encoding.
with open(INPUT_FILE, "r", encoding="utf-8") as f:
    data = json.load(f)
|
||||
|
||||
# -------------------------------
# Prepare cluster sets
# -------------------------------
claim_clusters = {entry["cluster_id"] for entry in data["claim_clusters"]}
event_clusters = {entry["cluster_id"] for entry in data["event_clusters"]}
all_clusters = claim_clusters.union(event_clusters)

# -------------------------------
# Build graph
# -------------------------------
# Undirected adjacency map: each link connects one claim cluster id with one
# event cluster id, and is recorded in both directions.
graph = defaultdict(set)
for edge in data.get("cluster_links", []):
    claim_side, event_side = edge["claim_cluster_id"], edge["event_cluster_id"]
    graph[claim_side].add(event_side)
    graph[event_side].add(claim_side)

# Looking a key up in a defaultdict creates it, so clusters that appear in no
# link still get an (empty) adjacency entry.
for cluster_id in all_clusters:
    graph[cluster_id]
|
||||
|
||||
# -------------------------------
# Find connected components
# -------------------------------
visited = set()
components = []

# Breadth-first sweep: every node not reached yet seeds a new component.
for start in graph:
    if start in visited:
        continue

    members = set()
    pending = deque([start])
    while pending:
        cluster = pending.popleft()
        if cluster in visited:
            # The same node can be queued more than once before it is visited.
            continue
        visited.add(cluster)
        members.add(cluster)
        pending.extend(n for n in graph[cluster] if n not in visited)

    components.append(members)
|
||||
|
||||
# Keep only connected components with more than MIN_COMPONENT_SIZE clusters.
# NOTE(review): the previous comment and log message mentioned "size > 8 and
# < 50" while the code filtered on > 1000; the code's threshold is kept here,
# confirm which bound is actually intended.
MIN_COMPONENT_SIZE = 1000
large_components = [c for c in components if len(c) > MIN_COMPONENT_SIZE]

print(f"Connected components (size > {MIN_COMPONENT_SIZE}):", len(large_components))
print("Total clusters in those components:", sum(len(c) for c in large_components))
|
||||
|
||||
# -------------------------------
# Prepare lookups
# -------------------------------
def _index(records, key_field, value_field):
    """Map each record's key_field to its value_field (later duplicates win)."""
    return {record[key_field]: record[value_field] for record in records}


# id -> text for individual claims and events
claim_lookup = _index(data["claims"], "id", "text")
event_lookup = _index(data["events"], "id", "text")
# cluster_id -> list of member ids for claim and event clusters
claim_cluster_map = _index(data["claim_clusters"], "cluster_id", "members")
event_cluster_map = _index(data["event_clusters"], "cluster_id", "members")
|
||||
|
||||
def extract_texts_for_cluster(cluster_id):
    """Return the texts of all members of the given cluster.

    The claim cluster map is consulted first, then the event cluster map;
    only the first map that contains ``cluster_id`` is used. Member ids that
    have no entry in the matching text lookup are left out. An unknown
    cluster id yields an empty list.
    """
    sources = (
        (claim_cluster_map, claim_lookup),
        (event_cluster_map, event_lookup),
    )
    for cluster_map, text_lookup in sources:
        if cluster_id in cluster_map:
            return [
                text_lookup[member_id]
                for member_id in cluster_map[cluster_id]
                if member_id in text_lookup
            ]
    return []
|
||||
|
||||
# -------------------------------
|
||||
# GPT-based title generation
|
||||
# -------------------------------
|
||||
def generate_title(texts):
    """Return a short title for a group of texts.

    The OpenAI request is currently disabled (it is kept below as a comment),
    so every call returns the placeholder "UNNAMED". The prompt is still
    assembled from ``texts`` so the request can be switched back on without
    further changes.
    """
    header = "Summarize the following texts into a concise 3 - 6 word title that captures the main theme:\n\n"
    bullet_list = "\n".join(f"- {t}" for t in texts)
    footer = "\n\nTitle:"
    prompt = header + bullet_list + footer

    try:
        # Disabled API call:
        # response = client.chat.completions.create(
        #     model=OPENAI_MODEL,
        #     messages=[
        #         {"role": "system", "content": "You are a helpful assistant who creates short, meaningful titles."},
        #         {"role": "user", "content": prompt}
        #     ]
        # )
        # title = response.choices[0].message.content.strip()
        # if title.lower().startswith("title:"):
        #     title = title[6:].strip()
        # return title
        return "UNNAMED"
    except Exception as err:
        # Only reachable once the request above is enabled again.
        print("Error generating title:", err)
        return "Untitled Cluster"
|
||||
|
||||
# -------------------------------
|
||||
# Wrapper for parallel execution
|
||||
# -------------------------------
|
||||
def generate_title_for_cluster(cluster_id):
    """Build the ``{"cluster_id", "title"}`` record for one cluster.

    Collects the cluster's member texts and passes them to
    ``generate_title``; used as the unit of work for the thread pool.
    """
    return {
        "cluster_id": cluster_id,
        "title": generate_title(extract_texts_for_cluster(cluster_id)),
    }
|
||||
|
||||
# -------------------------------
# Generate titles in parallel
# -------------------------------
# Flatten the selected components into one list of cluster ids.
clusters_in_large_components = [
    member for component in large_components for member in component
]
output = []

print("\nGenerating GPT titles for clusters (parallel)...")

with ThreadPoolExecutor(max_workers=10) as executor:
    # Remember which cluster each future belongs to so that a failure can
    # still be reported against its cluster id.
    future_to_cluster = {}
    for pending_id in clusters_in_large_components:
        future_to_cluster[executor.submit(generate_title_for_cluster, pending_id)] = pending_id

    progress = tqdm(
        as_completed(future_to_cluster),
        total=len(clusters_in_large_components),
        desc="Clusters",
        ncols=100,
    )
    # Results are appended in completion order, not submission order.
    for future in progress:
        try:
            output.append(future.result())
        except Exception as e:
            cid = future_to_cluster[future]
            print(f"Error processing cluster {cid}: {e}")
            output.append({"cluster_id": cid, "title": "Untitled Cluster"})
|
||||
|
||||
# -------------------------------
# Save JSON
# -------------------------------
# Write the list of {"cluster_id", "title"} records as indented JSON.
with open(OUTPUT_FILE, "w") as out_file:
    json.dump(output, out_file, indent=2)

print(f"\nSaved cluster titles to {OUTPUT_FILE}")
|
||||
@@ -1 +1,2 @@
|
||||
sentence_transformers
|
||||
dateparser
|
||||
Reference in New Issue
Block a user