Add create clusters init vers
This commit is contained in:
@@ -52,11 +52,71 @@ function buildGraph(data) {
|
|||||||
return { nodes, links };
|
return { nodes, links };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Group graph nodes into connected components, treating links as undirected.
 *
 * A link is only used when BOTH its `source` and `target` match the `id` of a
 * node in `nodes`. Dangling links are ignored so that an unknown id can never
 * be added to a component or inflate its size.
 *
 * @param {Array<{id: *}>} nodes - Graph nodes, each identified by `id`.
 * @param {Array<{source: *, target: *}>} links - Edges expressed as node ids.
 * @returns {Array<Array<*>>} One array of node ids per component. Components
 *   appear in the order their first node appears in `nodes`.
 */
function getConnectedComponents(nodes, links) {
  // Adjacency list keyed by node id; every node gets an entry, even if isolated.
  const adj = new Map();
  for (const n of nodes) {
    adj.set(n.id, new Set());
  }

  for (const { source, target } of links) {
    const sourceNeighbours = adj.get(source);
    const targetNeighbours = adj.get(target);
    // Skip links that point at an id which is not a known node.
    if (!sourceNeighbours || !targetNeighbours) continue;

    sourceNeighbours.add(target);
    targetNeighbours.add(source);
  }

  const visited = new Set();
  const components = [];

  for (const node of nodes) {
    if (visited.has(node.id)) continue;

    // Iterative depth-first traversal starting from this unvisited node.
    const stack = [node.id];
    const comp = [];

    while (stack.length > 0) {
      const id = stack.pop();
      // The same id can be pushed more than once before it is visited.
      if (visited.has(id)) continue;

      visited.add(id);
      comp.push(id);

      for (const nei of adj.get(id)) {
        if (!visited.has(nei)) stack.push(nei);
      }
    }

    components.push(comp);
  }

  return components;
}
|
||||||
|
|
||||||
export function App() {
|
export function App() {
|
||||||
const fgRef = useRef();
|
const fgRef = useRef();
|
||||||
const [selectedNode, setSelectedNode] = useState(null);
|
const [selectedNode, setSelectedNode] = useState(null);
|
||||||
|
const [minGraphSize, setMinGraphSize] = useState(10);
|
||||||
|
|
||||||
const graphData = useMemo(() => buildGraph(data), []);
|
const graphData = useMemo(() => {
|
||||||
|
const full = buildGraph(data);
|
||||||
|
|
||||||
|
const components = getConnectedComponents(full.nodes, full.links);
|
||||||
|
|
||||||
|
// keep only components large enough
|
||||||
|
const validIds = new Set(
|
||||||
|
components
|
||||||
|
.filter(comp => comp.length >= minGraphSize && comp.length < 50)
|
||||||
|
.flat()
|
||||||
|
);
|
||||||
|
|
||||||
|
const filteredNodes = full.nodes.filter(n => validIds.has(n.id));
|
||||||
|
|
||||||
|
const filteredLinks = full.links.filter(
|
||||||
|
l => validIds.has(l.source) && validIds.has(l.target)
|
||||||
|
);
|
||||||
|
|
||||||
|
return {
|
||||||
|
nodes: filteredNodes,
|
||||||
|
links: filteredLinks
|
||||||
|
};
|
||||||
|
}, [minGraphSize]);
|
||||||
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
if (!fgRef.current) return;
|
if (!fgRef.current) return;
|
||||||
@@ -83,7 +143,7 @@ export function App() {
|
|||||||
);
|
);
|
||||||
|
|
||||||
fgRef.current.d3ReheatSimulation();
|
fgRef.current.d3ReheatSimulation();
|
||||||
}, []);
|
}, [graphData]);
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<div>
|
<div>
|
||||||
@@ -149,19 +209,30 @@ export function App() {
|
|||||||
<div
|
<div
|
||||||
style={{
|
style={{
|
||||||
position: "absolute",
|
position: "absolute",
|
||||||
top: 0,
|
top: "10px",
|
||||||
right: 0,
|
right: "10px",
|
||||||
borderRadius: "3px",
|
borderRadius: "3px",
|
||||||
backgroundColor: "gray",
|
backgroundColor: "gray",
|
||||||
padding: "10px",
|
padding: "20px",
|
||||||
maxWidth: "300px"
|
maxWidth: "500px"
|
||||||
}}
|
}}
|
||||||
>
|
>
|
||||||
|
<h2>Config</h2>
|
||||||
|
<label>
|
||||||
|
Min connected graph size: <strong>{minGraphSize}</strong>
|
||||||
|
</label>
|
||||||
|
<br />
|
||||||
|
<input
|
||||||
|
type="range"
|
||||||
|
min="8"
|
||||||
|
max="49"
|
||||||
|
value={minGraphSize}
|
||||||
|
onChange={(e) => setMinGraphSize(Number(e.target.value))}
|
||||||
|
/>
|
||||||
|
|
||||||
<h2>Details</h2>
|
<h2>Details</h2>
|
||||||
{selectedNode ? (
|
{selectedNode ? (
|
||||||
<div>
|
<div>
|
||||||
<p><strong>ID:</strong> {selectedNode.id}</p>
|
|
||||||
<p><strong>Type:</strong> {selectedNode.type}</p>
|
|
||||||
<p><strong>Title:</strong> {selectedNode.label}</p>
|
<p><strong>Title:</strong> {selectedNode.label}</p>
|
||||||
|
|
||||||
{selectedNode.members && (
|
{selectedNode.members && (
|
||||||
|
|||||||
@@ -10,10 +10,10 @@ from sklearn.metrics.pairwise import cosine_similarity
|
|||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
INPUT_CSV = "../../data/dataset-dev.csv"
|
INPUT_CSV = "../../data/dataset.csv"
|
||||||
OUTPUT_JSON = "../../data/clustered_output.json"
|
OUTPUT_JSON = "../../data/clustered_output.json"
|
||||||
MODEL_NAME = "all-MiniLM-L6-v2"
|
MODEL_NAME = "all-MiniLM-L6-v2"
|
||||||
SIMILARITY_THRESHOLD = 0.65
|
SIMILARITY_THRESHOLD = 0.8
|
||||||
|
|
||||||
def generate_guid():
|
def generate_guid():
|
||||||
return str(uuid.uuid4())
|
return str(uuid.uuid4())
|
||||||
|
|||||||
@@ -0,0 +1,119 @@
|
|||||||
|
import json
|
||||||
|
from collections import defaultdict, deque
|
||||||
|
|
||||||
|
# -------------------------------
# CONFIG
# -------------------------------
INPUT_FILE = "../../data/clustered_output.json"    # Your original JSON
OUTPUT_FILE = "../../data/clustered_output2.json"  # Output JSON file

# A connected component is kept only when its size is strictly between
# these two bounds (both exclusive).
MIN_COMPONENT_SIZE_EXCLUSIVE = 8
MAX_COMPONENT_SIZE_EXCLUSIVE = 50

# -------------------------------
# Load data
# -------------------------------
# Explicit encoding so the read does not depend on the platform default.
with open(INPUT_FILE, "r", encoding="utf-8") as f:
    data = json.load(f)

# -------------------------------
# Prepare cluster sets
# -------------------------------
claim_clusters = {c["cluster_id"] for c in data["claim_clusters"]}
event_clusters = {e["cluster_id"] for e in data["event_clusters"]}
all_clusters = claim_clusters.union(event_clusters)

# -------------------------------
# Build graph from cluster links
# -------------------------------
# Undirected adjacency: each link connects a claim cluster and an event cluster.
graph = defaultdict(set)
for link in data.get("cluster_links", []):
    c_id = link["claim_cluster_id"]
    e_id = link["event_cluster_id"]
    graph[c_id].add(e_id)
    graph[e_id].add(c_id)

# Make sure all clusters appear in graph (even isolated ones)
for cid in all_clusters:
    graph.setdefault(cid, set())

# -------------------------------
# Find connected components
# -------------------------------
visited = set()
components = []

for node in graph:
    if node not in visited:
        # Breadth-first traversal collecting everything reachable from `node`.
        queue = deque([node])
        component = set()
        while queue:
            current = queue.popleft()
            # A node may be queued more than once before it is visited.
            if current in visited:
                continue
            visited.add(current)
            component.add(current)
            for neighbor in graph[current]:
                if neighbor not in visited:
                    queue.append(neighbor)
        components.append(component)

# Keep components whose size is strictly between the two configured bounds
large_components = [
    c
    for c in components
    if MIN_COMPONENT_SIZE_EXCLUSIVE < len(c) < MAX_COMPONENT_SIZE_EXCLUSIVE
]

# -------------------------------
# Output stats
# -------------------------------
num_components = len(large_components)
num_nodes = sum(len(c) for c in large_components)

print(f"Connected components (size > {MIN_COMPONENT_SIZE_EXCLUSIVE}):", num_components)
print("Total clusters in those components:", num_nodes)

# -------------------------------
# Prepare lookup tables
# -------------------------------
# id -> text for individual claims and events
claim_lookup = {c["id"]: c["text"] for c in data["claims"]}
event_lookup = {e["id"]: e["text"] for e in data["events"]}

# cluster_id -> members for each cluster kind
claim_cluster_map = {c["cluster_id"]: c["members"] for c in data["claim_clusters"]}
event_cluster_map = {e["cluster_id"]: e["members"] for e in data["event_clusters"]}
|
||||||
|
|
||||||
|
def extract_texts(component):
    """Collect the member texts of every cluster in ``component``.

    Each cluster id is looked up in ``claim_cluster_map`` first and, only if it
    is absent there, in ``event_cluster_map``; ids found in neither map are
    skipped. Member ids missing from the matching text lookup are skipped too.

    Returns a list of texts, in the iteration order of ``component``.
    """
    collected = []
    for cluster_id in component:
        if cluster_id in claim_cluster_map:
            members, lookup = claim_cluster_map[cluster_id], claim_lookup
        elif cluster_id in event_cluster_map:
            members, lookup = event_cluster_map[cluster_id], event_lookup
        else:
            continue

        for member_id in members:
            if member_id in lookup:
                collected.append(lookup[member_id])
    return collected
|
||||||
|
|
||||||
|
# -------------------------------
# Optional: Generate titles
# -------------------------------
user_input = input("Generate titles for each component? (y/n): ")

# Strip surrounding whitespace so answers such as "y " or " Y" are accepted.
if user_input.strip().lower() == "y":
    output = []

    for i, comp in enumerate(large_components):
        texts = extract_texts(comp)

        # Show a few sample texts
        print(f"\nComponent {i} sample texts:")
        for t in texts[:5]:
            print("-", t)

        # Ask user for a 3-5 word title (could be automated with OpenAI API)
        title = input("Enter 3-5 word title: ").strip()

        output.append({
            "component_id": i,
            # `comp` is a set; sort so the saved file is identical across runs.
            # key=str keeps the sort well-defined whatever the id type is.
            "cluster_ids": sorted(comp, key=str),
            "title": title
        })

    # Save JSON
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2)

    print(f"Saved cluster titles to {OUTPUT_FILE}")
else:
    print("No titles generated. Script finished.")
|
||||||
Reference in New Issue
Block a user