Update how scoring works with two passes of the data for timesaving. Add section on edge case handling to rules.

2026-02-26 10:09:36 +00:00
parent 8317fd85df
commit 6c3aa7343d
3 changed files with 167 additions and 267 deletions
@@ -5,17 +5,15 @@ import random
 from pathlib import Path
 from collections import Counter, defaultdict
 import pandas as pd
+from streamlit_sortables import sort_items

-# Path to your JSONL file
 INPUT_FILE = "../../data/results.jsonl"
 OUTPUT_FILE = "../../data/ranked.jsonl"

 # --------------------------
 # Helper functions
 # --------------------------
-
 def load_data(file_path):
-    """Load JSONL file into a list of dicts with parsed content."""
    data = []

    if Path(file_path).exists():
@@ -25,29 +23,20 @@ def load_data(file_path):
                    continue

                entry = json.loads(line)
-
                outputs = entry.get("output", [])

-                # ---- normalize format ----
-                # old format: list
-                # new format: single dict
                if isinstance(outputs, dict):
                    outputs = [outputs]

-                # ---- parse content ----
                for o in outputs:
                    content = o.get("content")
-
                    if content:
                        try:
                            o["content_parsed"] = json.loads(content)
                        except json.JSONDecodeError:
                            o["content_parsed"] = []
-                            print("parse error")

-                # optionally store normalized outputs back
                entry["output"] = outputs
-
                data.append(entry)

    return data
@@ -57,7 +46,6 @@ def save_data_clean(file_path, data):
    merged = {}

    for entry in data:
-        # collect all content_parsed items from this entry
        events = []
        for o in entry.get("output", []):
            if "content_parsed" in o:
@@ -68,31 +56,25 @@ def save_data_clean(file_path, data):
            continue

        if doc_url not in merged:
-            # take the first object's other values
            new_entry = entry.copy()
            new_entry["events"] = events
-
-            # remove unwanted fields safely
            new_entry.pop("output", None)
            new_entry.pop("status", None)
-
            merged[doc_url] = new_entry
        else:
-            # merge events into existing entry
            merged[doc_url]["events"].extend(events)

-    # sort events by human_score
    for entry in merged.values():
        entry["events"].sort(
            key=lambda e: e.get("human_score", 0),
-            reverse=True  # highest score first; remove if you want ascending
+            reverse=True
        )

-    # write merged results
    with open(file_path, "w", encoding="utf-8") as f:
        for entry in merged.values():
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")

+
 def save_data(file_path, data):
    with open(file_path, "w", encoding="utf-8") as f:
        for entry in data:
@@ -129,11 +111,17 @@ st.title("Claim Visualizer")
 # --------------------------
 view = st.sidebar.selectbox(
    "Choose View",
-    ["All Claims", "Single Claim Random", "View Rules", "Statistics"]
+    [
+        "All Claims",
+        "Single Claim Random",
+        "Rank Perfect Events",
+        "View Rules",
+        "Statistics"
+    ]
 )

 # --------------------------
-# ALL CLAIMS VIEW
+# View/AllClaims
 # --------------------------
 if view == "All Claims":
    st.header("All Claims")
@@ -150,51 +138,33 @@ if view == "All Claims":
                st.markdown("---")

 # --------------------------
-# SINGLE CLAIM RANDOM VIEW
+# View/Annotate
 # --------------------------
-
 elif view == "Single Claim Random":
-    # Select new entry if needed
+
    if st.session_state.current_claim is None:

-        unscored_entries = []
+        unannotated = []

        for entry in st.session_state.data:
-            unscored = []
+            claims = []

            for o in entry.get("output", []):
                for c in o.get("content_parsed", []):
-                    if c.get("human_score") is None:
-                        unscored.append(c)
+                    if not c.get("ranked"):
+                        claims.append(c)

-            if unscored:
-                # try to find an existing entry with same documentUrl
-                existing = next(
-                    (item for item in unscored_entries
-                    if item["entry"]["documentUrl"] == entry["documentUrl"]),
-                    None
-                )
+            if claims:
+                unannotated.append({"entry": entry, "claims": claims})

-                if existing:
-                    # append new claims to existing entry
-                    existing["claims"].extend(unscored)
-                else:
-                    # create new object
-                    unscored_entries.append({
-                        "entry": entry,
-                        "claims": list(unscored)
-                    })
-
-        if unscored_entries:
-            st.session_state.current_claim = random.choice(unscored_entries)
+        if unannotated:
+            st.session_state.current_claim = random.choice(unannotated)
            st.session_state.drag_order = None
-        else:
-            st.session_state.current_claim = None

    bundle = st.session_state.current_claim

    if bundle is None:
-        st.info("No entries remaining without human scores.")
+        st.info("All items annotated.")
    else:
        entry = bundle["entry"]
        claims = bundle["claims"]
@@ -202,144 +172,164 @@ elif view == "Single Claim Random":
        st.subheader(entry.get("text"))
        st.write(entry.get("normalized", ""))

-        # --------------------------
-        # Stable Drag IDs (FIX)
-        # --------------------------
-
-        claim_ids = [str(i) for i in range(len(claims))]
-
-        # Initialize order only once
-        if (
-            st.session_state.drag_order is None
-            or len(st.session_state.drag_order) != len(claim_ids)
-        ):
-            st.session_state.drag_order = claim_ids.copy()
-
-        ordered_indices = [
-            int(i) for i in st.session_state.drag_order
-        ]
-
-        # --------------------------
-        # Annotation Section
-        # --------------------------
-
        st.subheader("Annotate Events")

-        for pos, idx in enumerate(ordered_indices):
-            c = claims[idx]
+        for idx, c in enumerate(claims):

            with st.container(border=True):

                st.markdown(f"**Event:** {c.get('event')}")
-                st.markdown(
-                    f"**Reasoning:** {c.get('reasoningWhyRelevant')}"
-                )
+                st.markdown(f"**Reasoning:** {c.get('reasoningWhyRelevant')}")

                cols = st.columns(7)
-
                temp = ""

-                with cols[0]:
-                    a = st.checkbox("Rewording", key = "R" + str(idx) + c.get('event') )
-                    temp += "REWORDING " if a else ""
+                labels = [
+                    ("Rewording", "REWORDING"),
+                    ("Not Specific", "NSPECIFIC"),
+                    ("Time Incorrect", "TINCORRECT"),
+                    ("Story?", "STORY"),
+                    ("Duplicate?", "DUPLICATE"),
+                    ("Bias Shown", "BIAS"),
+                    ("Perfect", "PERFECT"),
+                ]

-                with cols[1]:
-                    a = st.checkbox("Not Specific", key = "S" + str(idx) + c.get('event') )
-                    temp += "NSPECIFIC " if a else ""
+                for i, (name, tag) in enumerate(labels):
+                    with cols[i]:
+                        if st.checkbox(name, key=f"{tag}{idx}{c.get('event')}"):
+                            temp += tag + " "

-                with cols[2]:
-                    a = st.checkbox("Time Incorrect", key = "T" + str(idx) + c.get('event') )
-                    temp += "TINCORRECT " if a else ""
-
-                with cols[3]:
-                    a = st.checkbox("Story?", key = "Y" + str(idx) + c.get('event') )
-                    temp += "STORY " if a else ""
-
-                with cols[4]:
-                    a = st.checkbox("Duplicate?", key = "D" + str(idx) + c.get('event') )
-                    temp += "DUPLICATE " if a else ""
-                
-                with cols[5]:
-                    a = st.checkbox("Bias Shown", key = "B" + str(idx) + c.get('event') )
-                    temp += "BIAS " if a else ""
-                
-                with cols[6]:
-                    a = st.checkbox("Perfect", key = "P" + str(idx) + c.get('event') )
-                    temp += "PERFECT " if a else ""
-
-                c["extra_info"] = temp
-
-                # ---- MOVE BUTTONS ----
-                move_cols = st.columns(2)
-
-                with move_cols[0]:
-                    if st.button(
-                        "Up",
-                        key="UP" + str(idx) + c.get("event")
-                    ):
-                        if pos > 0:
-                            order = st.session_state.drag_order
-                            order[pos], order[pos - 1] = order[pos - 1], order[pos]
-                            st.session_state.drag_order = order
-                            st.rerun()
-
-                with move_cols[1]:
-                    if st.button(
-                        "Down",
-                        key="DOWN" + str(idx) + c.get("event")
-                    ):
-                        if pos < len(st.session_state.drag_order) - 1:
-                            order = st.session_state.drag_order
-                            order[pos], order[pos + 1] = order[pos + 1], order[pos]
-                            st.session_state.drag_order = order
-                            st.rerun()
-
-
-        # --------------------------
-        # Submit Ranking
-        # --------------------------
-
-        if st.button("Submit Ranking"):
-
-            n = len(ordered_indices)
-
-            for rank_position, idx in enumerate(ordered_indices):
-
-                claim_obj = claims[idx]
-                score = 0
-                if n == 1:
-                    score = 1.0
-                else:
-                    score = 1 - (rank_position / (n - 1))
-
-                if (claim_obj["extra_info"] != ""):
-                    if (claim_obj["extra_info"].find("PERFECT") != -1):
-                        score = 1
-                    elif(claim_obj["extra_info"].find("DUPLICATE") != -1):
-                        score = 0
-                    else:
-                        score *= 0.5
-                    
-
-                claim_obj["human_score"] = round(score, 3)
+                c["extra_info"] = temp.strip()
+                c["ranked"] = True

+        if st.button("Save Annotation"):
            save_data(INPUT_FILE, st.session_state.data)
-            save_data_clean(OUTPUT_FILE, copy.deepcopy(st.session_state.data))
-
-
-            print("Ranking converted to scores and saved!")
-
            st.session_state.current_claim = None
-            st.session_state.drag_order = None
-
+            print("Annotation saved")
            st.rerun()

+# --------------------------
+# View/Rank
+# --------------------------
+
+elif view == "Rank Perfect Events":
+
+    st.header("Rank PERFECT Events")
+    candidates = []
+
+    for entry in st.session_state.data:
+        perfect = []
+
+        for o in entry.get("output", []):
+            for c in o.get("content_parsed", []):
+                if "PERFECT" in c.get("extra_info", "") and not c.get("rank_position"):
+                    perfect.append(c)
+
+        if perfect:
+            candidates.append({"entry": entry, "claims": perfect})
+
+    if not candidates:
+        st.info("No PERFECT events available.")
+        st.stop()
+
+    if "current_bundle" not in st.session_state:
+        st.session_state.current_bundle = random.choice(candidates)
+
+    bundle = st.session_state.current_bundle
+    entry = bundle["entry"]
+    claims = bundle["claims"]
+
+    st.subheader(entry.get("text"))
+
+    # init
+    if "perfect_order" not in st.session_state:
+        st.session_state.perfect_order = list(range(len(claims)))
+
+    order = st.session_state.perfect_order
+
+    # labels shown in sortable UI
+    labels = [
+        f"{i+1}. {claims[idx].get('event')}"
+        for i, idx in enumerate(order)
+    ]
+
+    st.markdown("### Drag to reorder:")
+
+    # -------------------------
+    # Drag & drop UI
+    # -------------------------
+    new_labels = sort_items(labels)
+
+    # Convert reordered labels back → indices
+    if new_labels != labels:
+        new_order = []
+        for lbl in new_labels:
+            original_pos = labels.index(lbl)
+            new_order.append(order[original_pos])
+
+        st.session_state.perfect_order = new_order
+        order = new_order
+
+    st.markdown("---")
+    for rank, idx in enumerate(order):
+        c = claims[idx]
+        st.markdown(f"**Rank {rank+1}: {c.get('event')}**")
+        st.markdown(c.get("reasoningWhyRelevant"))
+        st.markdown("---")
+
+    if st.button("Submit PERFECT Ranking"):
+
+        n = len(order)
+
+        for rank_position, idx in enumerate(order):
+            claim_obj = claims[idx]
+
+            # explicit stored rank
+            claim_obj["rank_position"] = rank_position + 1
+
+            claim_obj["human_score"] = 1
+
+        # Auto-scoring
+        for entry in st.session_state.data:
+            for o in entry.get("output", []):
+                for c in o.get("content_parsed", []):
+
+                    if c.get("human_score") is not None:
+                        continue
+
+                    extra = c.get("extra_info", "")
+
+                    if "DUPLICATE" in extra:
+                        c["human_score"] = 0
+                    elif extra:
+                        c["human_score"] = round(
+                            c.get("score", 0) * 0.5, 3
+                        )
+
+        save_data(INPUT_FILE, st.session_state.data)
+        save_data_clean(
+            OUTPUT_FILE,
+            copy.deepcopy(st.session_state.data)
+        )
+
+        # reset state for next example
+        del st.session_state.current_bundle
+        del st.session_state.perfect_order
+
+        print("Ranking saved!")
+        st.rerun()
+
+# --------------------------
+# View/Rules
+# --------------------------
 elif view == "View Rules":
    with open("rules.txt", "r", encoding="utf-8") as f:
        st.write(f.read())

+# --------------------------
+# View/Statistics
+# --------------------------
 elif view == "Statistics":
-
    st.header("Statistics")

    word_counter = Counter()
@@ -359,20 +349,9 @@ elif view == "Statistics":
                    words = extra.strip().split()
                    word_counter.update(words)

-                # ---- human score aggregation ----
-                hs = c.get("human_score")
-                if hs is not None and doc_url:
-                    doc_scores[doc_url].append(hs)
-
-                # ---- diff score aggregation ----
-                s = c.get("score")
-                if hs is not None and s is not None and doc_url:
-                    diff = abs(hs - s)
-                    diff_scores[doc_url].append(diff)
-
-    # ==========================
+    # --------------------------
    # Extra Info Word Counts
-    # ==========================
+    # --------------------------
    st.subheader("Extra Info Label Counts")

    if word_counter:
@@ -384,87 +363,4 @@ elif view == "Statistics":
        st.dataframe(df_words)
        st.bar_chart(df_words.set_index("Label"))
    else:
-        st.info("No extra_info data available yet.")
-
-    # ==========================
-    # Avg Human Score per Document
-    # ==========================
-    st.subheader("Average Human Score per documentUrl")
-
-    avg_scores = []
-
-    for doc, scores in doc_scores.items():
-        if scores:
-            avg_scores.append({
-                "documentUrl": doc,
-                "avg_human_score": sum(scores) / len(scores),
-                "num_events": len(scores)
-            })
-
-    if avg_scores:
-        df_scores = pd.DataFrame(avg_scores).sort_values(
-            "avg_human_score",
-            ascending=False
-        )
-
-        st.dataframe(df_scores)
-        # ==========================
-        # Distribution (rounded to 0.1)
-        # ==========================
-
-        st.subheader("Distribution of Average Human Scores (Rounded to 0.1)")
-
-        # round averages to nearest 0.1
-        df_scores["rounded_score"] = (
-            df_scores["avg_human_score"].round(1)
-        )
-
-        # count how many docs fall into each bucket
-        dist = (
-            df_scores["rounded_score"]
-            .value_counts()
-            .sort_index()
-            .reset_index()
-        )
-
-        dist.columns = ["rounded_score", "count"]
-
-        # ensure all bins from 0.0 → 1.0 exist
-        all_bins = pd.DataFrame({
-            "rounded_score": [round(x * 0.1, 1) for x in range(11)]
-        })
-
-        dist = (
-            all_bins.merge(dist, on="rounded_score", how="left")
-            .fillna(0)
-        )
-
-        dist["count"] = dist["count"].astype(int)
-
-        # plot counts per score bucket
-        st.bar_chart(
-            dist.set_index("rounded_score")["count"]
-        )
-    else:
-        st.info("No human scores available yet.")
-
-    # ==========================
-    # Overall Model vs Human Difference
-    # ==========================
-    st.subheader("Model vs Human Agreement")
-
-    all_diffs = [
-        diff
-        for diffs in diff_scores.values()
-        for diff in diffs
-    ]
-
-    if all_diffs:
-        avg_diff = sum(all_diffs) / len(all_diffs)
-
-        st.write(
-            f"Average absolute difference between model score and human score: "
-            f"**{avg_diff:.3f}**"
-        )
-    else:
-        st.info("No items have both score and human_score yet.")
+        st.info("No extra_info data available yet.")
@@ -1 +1,2 @@
-streamlit
+streamlit
+streamlit-sortables
@@ -14,4 +14,7 @@

 5. Proposed trigger events should be sufficiently different from one another

-6. Proposed trigger events must be free from bias, and backed up by reliable evidence
+6. Proposed trigger events must be free from bias, and backed up by reliable evidence
+
+Edge case handing:
+In the event analysis is perfect, however contains section(s) that violate one of the above, a worst case approach should be taken and the analysis should be laballed negativley