Add initial version of ROBERTA classifier, add ability for multi pi charts

2026-03-11 22:02:31 +00:00
parent ef6330ec07
commit f09e36e740
6 changed files with 299 additions and 33 deletions
@@ -1,4 +1,6 @@
 from collections import Counter
+from pathlib import Path
+import json
 import streamlit as st
 import pandas as pd
 import matplotlib.pyplot as plt
@@ -27,22 +29,6 @@ def render():
                    words = extra.strip().split()
                    word_counter.update(words)

-                # ---- confidence classification ----
-                if score is not None:
-                    extra_lower = extra.strip().lower()
-
-                    if score > THRESH and extra_lower == "perfect":
-                        confidence_counter["Correct"] += 1
-
-                    elif score > THRESH and extra_lower != "perfect":
-                        confidence_counter["Over-confident"] += 1
-
-                    elif score < THRESH and extra_lower == "perfect":
-                        confidence_counter["Under-confident"] += 1
-
-                    else:
-                        confidence_counter["Other"] += 1
-
    # --------------------------
    # Extra Info Word Counts
    # --------------------------
@@ -62,24 +48,62 @@ def render():
    # --------------------------
    # Confidence vs Label Stats
    # --------------------------
-    st.subheader("Confidence vs Label Distribution")
+    st.header("Confidence vs Label Distribution per JSONL File")

-    if confidence_counter:
-        df_conf = pd.DataFrame(
-            confidence_counter.items(),
-            columns=["Category", "Count"]
-        )
+    path = Path("../../data/reranked")

-        fig, ax = plt.subplots()
-        ax.pie(
-            df_conf["Count"],
-            labels=df_conf["Category"],
-            autopct="%1.1f%%",
-            startangle=90
-        )
-        ax.axis("equal")
+    if not path.exists() or not path.is_dir():
+        st.error("Invalid folder path.")
+        return

-        st.pyplot(fig, width=500)
+    jsonl_files = list(path.glob("*.jsonl"))
+    if not jsonl_files:
+        st.info("No .jsonl files found in this folder.")
+        return

-    else:
-        st.info("No score data available yet.")
+    for file_path in jsonl_files:
+        st.subheader(f"File: {file_path.name}")
+
+        confidence_counter = Counter()
+
+        # ---- Read file line by line ----
+        with open(file_path, "r", encoding="utf-8") as f:
+            for line in f:
+                try:
+                    entry = json.loads(line)
+                except json.JSONDecodeError:
+                    continue
+
+                for event in entry.get("events", []):
+                    score = event.get("score", None)
+                    extra_lower = (event.get("extra_info", "") or "").strip().lower()
+                    print(extra_lower)
+                    if score is not None:
+                        if score > THRESH and extra_lower == "perfect":
+                            confidence_counter["Correct"] += 1
+                        elif score > THRESH and extra_lower != "perfect":
+                            confidence_counter["Over-confident"] += 1
+                        elif score < THRESH and extra_lower == "perfect":
+                            confidence_counter["Under-confident"] += 1
+                        else:
+                            confidence_counter["Other"] += 1
+
+        if confidence_counter:
+            df_conf = pd.DataFrame(
+                confidence_counter.items(),
+                columns=["Category", "Count"]
+            )
+
+            fig, ax = plt.subplots()
+            ax.pie(
+                df_conf["Count"],
+                labels=df_conf["Category"],
+                autopct="%1.1f%%",
+                startangle=90
+            )
+            ax.axis("equal")
+            ax.set_title(file_path.name)
+
+            st.pyplot(fig, width=500)
+        else:
+            st.info("No score data available in this file.")