Update scoring to use two passes over the data (annotate first, then rank only PERFECT events) to save time. Add a section on edge-case handling to the rules.

2026-02-26 10:09:36 +00:00
parent 8317fd85df
commit 6c3aa7343d
3 changed files with 167 additions and 267 deletions
@@ -5,17 +5,15 @@ import random
 from pathlib import Path
 from collections import Counter, defaultdict
 import pandas as pd
 from streamlit_sortables import sort_items
 # Path to your JSONL file
 INPUT_FILE = "../../data/results.jsonl"
 OUTPUT_FILE = "../../data/ranked.jsonl"
 # --------------------------
 # Helper functions
 # --------------------------
 def load_data(file_path):
    """Load JSONL file into a list of dicts with parsed content."""
    data = []
    if Path(file_path).exists():
@@ -25,29 +23,20 @@ def load_data(file_path):
                    continue
                entry = json.loads(line)
                outputs = entry.get("output", [])
                # ---- normalize format ----
                # old format: list
                # new format: single dict
                if isinstance(outputs, dict):
                    outputs = [outputs]
                # ---- parse content ----
                for o in outputs:
                    content = o.get("content")
                    if content:
                        try:
                            o["content_parsed"] = json.loads(content)
                        except json.JSONDecodeError:
                            o["content_parsed"] = []
                            print("parse error")
                # optionally store normalized outputs back
                entry["output"] = outputs
                data.append(entry)
    return data
@@ -57,7 +46,6 @@ def save_data_clean(file_path, data):
    merged = {}
    for entry in data:
        # collect all content_parsed items from this entry
        events = []
        for o in entry.get("output", []):
            if "content_parsed" in o:
@@ -68,31 +56,25 @@ def save_data_clean(file_path, data):
            continue
        if doc_url not in merged:
            # take the first object's other values
            new_entry = entry.copy()
            new_entry["events"] = events
            # remove unwanted fields safely
            new_entry.pop("output", None)
            new_entry.pop("status", None)
            merged[doc_url] = new_entry
        else:
            # merge events into existing entry
            merged[doc_url]["events"].extend(events)
    # sort events by human_score
    for entry in merged.values():
        entry["events"].sort(
            key=lambda e: e.get("human_score", 0),
-            reverse=True  # highest score first; remove if you want ascending
+            reverse=True
        )
    # write merged results
    with open(file_path, "w", encoding="utf-8") as f:
        for entry in merged.values():
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")
 def save_data(file_path, data):
    with open(file_path, "w", encoding="utf-8") as f:
        for entry in data:
@@ -129,11 +111,17 @@ st.title("Claim Visualizer")
 # --------------------------
 view = st.sidebar.selectbox(
    "Choose View",
-    ["All Claims", "Single Claim Random", "View Rules", "Statistics"]
+    [
        "All Claims",
        "Single Claim Random",
        "Rank Perfect Events",
        "View Rules",
        "Statistics"
    ]
 )
 # --------------------------
-# ALL CLAIMS VIEW
+# View/AllClaims
 # --------------------------
 if view == "All Claims":
    st.header("All Claims")
@@ -150,51 +138,33 @@ if view == "All Claims":
                st.markdown("---")
 # --------------------------
-# SINGLE CLAIM RANDOM VIEW
+# View/Annotate
 # --------------------------
 elif view == "Single Claim Random":
-    # Select new entry if needed
+
    if st.session_state.current_claim is None:
-        unscored_entries = []
+        unannotated = []
        for entry in st.session_state.data:
-            unscored = []
+            claims = []
            for o in entry.get("output", []):
                for c in o.get("content_parsed", []):
-                    if c.get("human_score") is None:
+                    if not c.get("ranked"):
-                        unscored.append(c)
+                        claims.append(c)
-            if unscored:
+            if claims:
-                # try to find an existing entry with same documentUrl
+                unannotated.append({"entry": entry, "claims": claims})
                existing = next(
                    (item for item in unscored_entries
                    if item["entry"]["documentUrl"] == entry["documentUrl"]),
                    None
                )
-                if existing:
+        if unannotated:
-                    # append new claims to existing entry
+            st.session_state.current_claim = random.choice(unannotated)
                    existing["claims"].extend(unscored)
                else:
                    # create new object
                    unscored_entries.append({
                        "entry": entry,
                        "claims": list(unscored)
                    })
        if unscored_entries:
            st.session_state.current_claim = random.choice(unscored_entries)
            st.session_state.drag_order = None
        else:
            st.session_state.current_claim = None
    bundle = st.session_state.current_claim
    if bundle is None:
-        st.info("No entries remaining without human scores.")
+        st.info("All items annotated.")
    else:
        entry = bundle["entry"]
        claims = bundle["claims"]
@@ -202,144 +172,164 @@ elif view == "Single Claim Random":
        st.subheader(entry.get("text"))
        st.write(entry.get("normalized", ""))
        # --------------------------
        # Stable Drag IDs (FIX)
        # --------------------------
        claim_ids = [str(i) for i in range(len(claims))]
        # Initialize order only once
        if (
            st.session_state.drag_order is None
            or len(st.session_state.drag_order) != len(claim_ids)
        ):
            st.session_state.drag_order = claim_ids.copy()
        ordered_indices = [
            int(i) for i in st.session_state.drag_order
        ]
        # --------------------------
        # Annotation Section
        # --------------------------
        st.subheader("Annotate Events")
-        for pos, idx in enumerate(ordered_indices):
+        for idx, c in enumerate(claims):
            c = claims[idx]
            with st.container(border=True):
                st.markdown(f"**Event:** {c.get('event')}")
-                st.markdown(
+                st.markdown(f"**Reasoning:** {c.get('reasoningWhyRelevant')}")
                    f"**Reasoning:** {c.get('reasoningWhyRelevant')}"
                )
                cols = st.columns(7)
                temp = ""
-                with cols[0]:
+                labels = [
-                    a = st.checkbox("Rewording", key = "R" + str(idx) + c.get('event') )
+                    ("Rewording", "REWORDING"),
-                    temp += "REWORDING " if a else ""
+                    ("Not Specific", "NSPECIFIC"),
                    ("Time Incorrect", "TINCORRECT"),
                    ("Story?", "STORY"),
                    ("Duplicate?", "DUPLICATE"),
                    ("Bias Shown", "BIAS"),
                    ("Perfect", "PERFECT"),
                ]
-                with cols[1]:
+                for i, (name, tag) in enumerate(labels):
-                    a = st.checkbox("Not Specific", key = "S" + str(idx) + c.get('event') )
+                    with cols[i]:
-                    temp += "NSPECIFIC " if a else ""
+                        if st.checkbox(name, key=f"{tag}{idx}{c.get('event')}"):
                            temp += tag + " "
-                with cols[2]:
+                c["extra_info"] = temp.strip()
-                    a = st.checkbox("Time Incorrect", key = "T" + str(idx) + c.get('event') )
+                c["ranked"] = True
                    temp += "TINCORRECT " if a else ""
-                with cols[3]:
+        if st.button("Save Annotation"):
-                    a = st.checkbox("Story?", key = "Y" + str(idx) + c.get('event') )
+            save_data(INPUT_FILE, st.session_state.data)
-                    temp += "STORY " if a else ""
+            st.session_state.current_claim = None
-
+            print("Annotation saved")
                with cols[4]:
                    a = st.checkbox("Duplicate?", key = "D" + str(idx) + c.get('event') )
                    temp += "DUPLICATE " if a else ""
                with cols[5]:
                    a = st.checkbox("Bias Shown", key = "B" + str(idx) + c.get('event') )
                    temp += "BIAS " if a else ""
                with cols[6]:
                    a = st.checkbox("Perfect", key = "P" + str(idx) + c.get('event') )
                    temp += "PERFECT " if a else ""
                c["extra_info"] = temp
                # ---- MOVE BUTTONS ----
                move_cols = st.columns(2)
                with move_cols[0]:
                    if st.button(
                        "Up",
                        key="UP" + str(idx) + c.get("event")
                    ):
                        if pos > 0:
                            order = st.session_state.drag_order
                            order[pos], order[pos - 1] = order[pos - 1], order[pos]
                            st.session_state.drag_order = order
            st.rerun()
                with move_cols[1]:
                    if st.button(
                        "Down",
                        key="DOWN" + str(idx) + c.get("event")
                    ):
                        if pos < len(st.session_state.drag_order) - 1:
                            order = st.session_state.drag_order
                            order[pos], order[pos + 1] = order[pos + 1], order[pos]
                            st.session_state.drag_order = order
                            st.rerun()
 # --------------------------
-        # Submit Ranking
+# View/Rank
 # --------------------------
-        if st.button("Submit Ranking"):
+elif view == "Rank Perfect Events":
-            n = len(ordered_indices)
+    st.header("Rank PERFECT Events")
    candidates = []
-            for rank_position, idx in enumerate(ordered_indices):
+    for entry in st.session_state.data:
        perfect = []
        for o in entry.get("output", []):
            for c in o.get("content_parsed", []):
                if "PERFECT" in c.get("extra_info", "") and not c.get("rank_position"):
                    perfect.append(c)
        if perfect:
            candidates.append({"entry": entry, "claims": perfect})
    if not candidates:
        st.info("No PERFECT events available.")
        st.stop()
    if "current_bundle" not in st.session_state:
        st.session_state.current_bundle = random.choice(candidates)
    bundle = st.session_state.current_bundle
    entry = bundle["entry"]
    claims = bundle["claims"]
    st.subheader(entry.get("text"))
    # init
    if "perfect_order" not in st.session_state:
        st.session_state.perfect_order = list(range(len(claims)))
    order = st.session_state.perfect_order
    # labels shown in sortable UI
    labels = [
        f"{i+1}. {claims[idx].get('event')}"
        for i, idx in enumerate(order)
    ]
    st.markdown("### Drag to reorder:")
    # -------------------------
    # Drag & drop UI
    # -------------------------
    new_labels = sort_items(labels)
    # Convert reordered labels back → indices
    if new_labels != labels:
        new_order = []
        for lbl in new_labels:
            original_pos = labels.index(lbl)
            new_order.append(order[original_pos])
        st.session_state.perfect_order = new_order
        order = new_order
    st.markdown("---")
    for rank, idx in enumerate(order):
        c = claims[idx]
        st.markdown(f"**Rank {rank+1}: {c.get('event')}**")
        st.markdown(c.get("reasoningWhyRelevant"))
        st.markdown("---")
    if st.button("Submit PERFECT Ranking"):
        n = len(order)
        for rank_position, idx in enumerate(order):
            claim_obj = claims[idx]
                score = 0
                if n == 1:
                    score = 1.0
                else:
                    score = 1 - (rank_position / (n - 1))
-                if (claim_obj["extra_info"] != ""):
+            # explicit stored rank
-                    if (claim_obj["extra_info"].find("PERFECT") != -1):
+            claim_obj["rank_position"] = rank_position + 1
                        score = 1
                    elif(claim_obj["extra_info"].find("DUPLICATE") != -1):
                        score = 0
                    else:
                        score *= 0.5
            claim_obj["human_score"] = 1
-                claim_obj["human_score"] = round(score, 3)
+        # Auto-scoring
        for entry in st.session_state.data:
            for o in entry.get("output", []):
                for c in o.get("content_parsed", []):
                    if c.get("human_score") is not None:
                        continue
                    extra = c.get("extra_info", "")
                    if "DUPLICATE" in extra:
                        c["human_score"] = 0
                    elif extra:
                        c["human_score"] = round(
                            c.get("score", 0) * 0.5, 3
                        )
        save_data(INPUT_FILE, st.session_state.data)
-            save_data_clean(OUTPUT_FILE, copy.deepcopy(st.session_state.data))
+        save_data_clean(
            OUTPUT_FILE,
            copy.deepcopy(st.session_state.data)
        )
        # reset state for next example
        del st.session_state.current_bundle
        del st.session_state.perfect_order
-            print("Ranking converted to scores and saved!")
+        print("Ranking saved!")
            st.session_state.current_claim = None
            st.session_state.drag_order = None
        st.rerun()
 # --------------------------
 # View/Rules
 # --------------------------
 elif view == "View Rules":
    with open("rules.txt", "r", encoding="utf-8") as f:
        st.write(f.read())
 # --------------------------
 # View/Statistics
 # --------------------------
 elif view == "Statistics":
    st.header("Statistics")
    word_counter = Counter()
@@ -359,20 +349,9 @@ elif view == "Statistics":
                    words = extra.strip().split()
                    word_counter.update(words)
-                # ---- human score aggregation ----
+    # --------------------------
                hs = c.get("human_score")
                if hs is not None and doc_url:
                    doc_scores[doc_url].append(hs)
                # ---- diff score aggregation ----
                s = c.get("score")
                if hs is not None and s is not None and doc_url:
                    diff = abs(hs - s)
                    diff_scores[doc_url].append(diff)
    # ==========================
    # Extra Info Word Counts
-    # ==========================
+    # --------------------------
    st.subheader("Extra Info Label Counts")
    if word_counter:
@@ -385,86 +364,3 @@ elif view == "Statistics":
        st.bar_chart(df_words.set_index("Label"))
    else:
        st.info("No extra_info data available yet.")
    # ==========================
    # Avg Human Score per Document
    # ==========================
    st.subheader("Average Human Score per documentUrl")
    avg_scores = []
    for doc, scores in doc_scores.items():
        if scores:
            avg_scores.append({
                "documentUrl": doc,
                "avg_human_score": sum(scores) / len(scores),
                "num_events": len(scores)
            })
    if avg_scores:
        df_scores = pd.DataFrame(avg_scores).sort_values(
            "avg_human_score",
            ascending=False
        )
        st.dataframe(df_scores)
        # ==========================
        # Distribution (rounded to 0.1)
        # ==========================
        st.subheader("Distribution of Average Human Scores (Rounded to 0.1)")
        # round averages to nearest 0.1
        df_scores["rounded_score"] = (
            df_scores["avg_human_score"].round(1)
        )
        # count how many docs fall into each bucket
        dist = (
            df_scores["rounded_score"]
            .value_counts()
            .sort_index()
            .reset_index()
        )
        dist.columns = ["rounded_score", "count"]
        # ensure all bins from 0.0 → 1.0 exist
        all_bins = pd.DataFrame({
            "rounded_score": [round(x * 0.1, 1) for x in range(11)]
        })
        dist = (
            all_bins.merge(dist, on="rounded_score", how="left")
            .fillna(0)
        )
        dist["count"] = dist["count"].astype(int)
        # plot counts per score bucket
        st.bar_chart(
            dist.set_index("rounded_score")["count"]
        )
    else:
        st.info("No human scores available yet.")
    # ==========================
    # Overall Model vs Human Difference
    # ==========================
    st.subheader("Model vs Human Agreement")
    all_diffs = [
        diff
        for diffs in diff_scores.values()
        for diff in diffs
    ]
    if all_diffs:
        avg_diff = sum(all_diffs) / len(all_diffs)
        st.write(
            f"Average absolute difference between model score and human score: "
            f"**{avg_diff:.3f}**"
        )
    else:
        st.info("No items have both score and human_score yet.")
@@ -1 +1,2 @@
 streamlit
 streamlit-sortables
@@ -15,3 +15,6 @@
 5. Proposed trigger events should be sufficiently different from one another
 6. Proposed trigger events must be free from bias, and backed up by reliable evidence
 Edge case handling:
 In the event that an analysis is otherwise perfect but contains section(s) that violate one of the above rules, a worst-case approach should be taken and the analysis should be labelled negatively.