diff --git a/supporting/scorer/display.py b/supporting/scorer/display.py index e53550c..2e6ce31 100644 --- a/supporting/scorer/display.py +++ b/supporting/scorer/display.py @@ -5,17 +5,15 @@ import random from pathlib import Path from collections import Counter, defaultdict import pandas as pd +from streamlit_sortables import sort_items -# Path to your JSONL file INPUT_FILE = "../../data/results.jsonl" OUTPUT_FILE = "../../data/ranked.jsonl" # -------------------------- # Helper functions # -------------------------- - def load_data(file_path): - """Load JSONL file into a list of dicts with parsed content.""" data = [] if Path(file_path).exists(): @@ -25,29 +23,20 @@ def load_data(file_path): continue entry = json.loads(line) - outputs = entry.get("output", []) - # ---- normalize format ---- - # old format: list - # new format: single dict if isinstance(outputs, dict): outputs = [outputs] - # ---- parse content ---- for o in outputs: content = o.get("content") - if content: try: o["content_parsed"] = json.loads(content) except json.JSONDecodeError: o["content_parsed"] = [] - print("parse error") - # optionally store normalized outputs back entry["output"] = outputs - data.append(entry) return data @@ -57,7 +46,6 @@ def save_data_clean(file_path, data): merged = {} for entry in data: - # collect all content_parsed items from this entry events = [] for o in entry.get("output", []): if "content_parsed" in o: @@ -68,31 +56,25 @@ def save_data_clean(file_path, data): continue if doc_url not in merged: - # take the first object's other values new_entry = entry.copy() new_entry["events"] = events - - # remove unwanted fields safely new_entry.pop("output", None) new_entry.pop("status", None) - merged[doc_url] = new_entry else: - # merge events into existing entry merged[doc_url]["events"].extend(events) - # sort events by human_score for entry in merged.values(): entry["events"].sort( key=lambda e: e.get("human_score", 0), - reverse=True # highest score first; remove if 
you want ascending + reverse=True ) - # write merged results with open(file_path, "w", encoding="utf-8") as f: for entry in merged.values(): f.write(json.dumps(entry, ensure_ascii=False) + "\n") + def save_data(file_path, data): with open(file_path, "w", encoding="utf-8") as f: for entry in data: @@ -129,11 +111,17 @@ st.title("Claim Visualizer") # -------------------------- view = st.sidebar.selectbox( "Choose View", - ["All Claims", "Single Claim Random", "View Rules", "Statistics"] + [ + "All Claims", + "Single Claim Random", + "Rank Perfect Events", + "View Rules", + "Statistics" + ] ) # -------------------------- -# ALL CLAIMS VIEW +# View/AllClaims # -------------------------- if view == "All Claims": st.header("All Claims") @@ -150,51 +138,33 @@ if view == "All Claims": st.markdown("---") # -------------------------- -# SINGLE CLAIM RANDOM VIEW +# View/Annotate # -------------------------- - elif view == "Single Claim Random": - # Select new entry if needed + if st.session_state.current_claim is None: - unscored_entries = [] + unannotated = [] for entry in st.session_state.data: - unscored = [] + claims = [] for o in entry.get("output", []): for c in o.get("content_parsed", []): - if c.get("human_score") is None: - unscored.append(c) + if not c.get("ranked"): + claims.append(c) - if unscored: - # try to find an existing entry with same documentUrl - existing = next( - (item for item in unscored_entries - if item["entry"]["documentUrl"] == entry["documentUrl"]), - None - ) + if claims: + unannotated.append({"entry": entry, "claims": claims}) - if existing: - # append new claims to existing entry - existing["claims"].extend(unscored) - else: - # create new object - unscored_entries.append({ - "entry": entry, - "claims": list(unscored) - }) - - if unscored_entries: - st.session_state.current_claim = random.choice(unscored_entries) + if unannotated: + st.session_state.current_claim = random.choice(unannotated) st.session_state.drag_order = None - else: - 
st.session_state.current_claim = None bundle = st.session_state.current_claim if bundle is None: - st.info("No entries remaining without human scores.") + st.info("All items annotated.") else: entry = bundle["entry"] claims = bundle["claims"] @@ -202,144 +172,164 @@ elif view == "Single Claim Random": st.subheader(entry.get("text")) st.write(entry.get("normalized", "")) - # -------------------------- - # Stable Drag IDs (FIX) - # -------------------------- - - claim_ids = [str(i) for i in range(len(claims))] - - # Initialize order only once - if ( - st.session_state.drag_order is None - or len(st.session_state.drag_order) != len(claim_ids) - ): - st.session_state.drag_order = claim_ids.copy() - - ordered_indices = [ - int(i) for i in st.session_state.drag_order - ] - - # -------------------------- - # Annotation Section - # -------------------------- - st.subheader("Annotate Events") - for pos, idx in enumerate(ordered_indices): - c = claims[idx] + for idx, c in enumerate(claims): with st.container(border=True): st.markdown(f"**Event:** {c.get('event')}") - st.markdown( - f"**Reasoning:** {c.get('reasoningWhyRelevant')}" - ) + st.markdown(f"**Reasoning:** {c.get('reasoningWhyRelevant')}") cols = st.columns(7) - temp = "" - with cols[0]: - a = st.checkbox("Rewording", key = "R" + str(idx) + c.get('event') ) - temp += "REWORDING " if a else "" + labels = [ + ("Rewording", "REWORDING"), + ("Not Specific", "NSPECIFIC"), + ("Time Incorrect", "TINCORRECT"), + ("Story?", "STORY"), + ("Duplicate?", "DUPLICATE"), + ("Bias Shown", "BIAS"), + ("Perfect", "PERFECT"), + ] - with cols[1]: - a = st.checkbox("Not Specific", key = "S" + str(idx) + c.get('event') ) - temp += "NSPECIFIC " if a else "" + for i, (name, tag) in enumerate(labels): + with cols[i]: + if st.checkbox(name, key=f"{tag}{idx}{c.get('event')}"): + temp += tag + " " - with cols[2]: - a = st.checkbox("Time Incorrect", key = "T" + str(idx) + c.get('event') ) - temp += "TINCORRECT " if a else "" - - with cols[3]: - 
a = st.checkbox("Story?", key = "Y" + str(idx) + c.get('event') ) - temp += "STORY " if a else "" - - with cols[4]: - a = st.checkbox("Duplicate?", key = "D" + str(idx) + c.get('event') ) - temp += "DUPLICATE " if a else "" - - with cols[5]: - a = st.checkbox("Bias Shown", key = "B" + str(idx) + c.get('event') ) - temp += "BIAS " if a else "" - - with cols[6]: - a = st.checkbox("Perfect", key = "P" + str(idx) + c.get('event') ) - temp += "PERFECT " if a else "" - - c["extra_info"] = temp - - # ---- MOVE BUTTONS ---- - move_cols = st.columns(2) - - with move_cols[0]: - if st.button( - "Up", - key="UP" + str(idx) + c.get("event") - ): - if pos > 0: - order = st.session_state.drag_order - order[pos], order[pos - 1] = order[pos - 1], order[pos] - st.session_state.drag_order = order - st.rerun() - - with move_cols[1]: - if st.button( - "Down", - key="DOWN" + str(idx) + c.get("event") - ): - if pos < len(st.session_state.drag_order) - 1: - order = st.session_state.drag_order - order[pos], order[pos + 1] = order[pos + 1], order[pos] - st.session_state.drag_order = order - st.rerun() - - - # -------------------------- - # Submit Ranking - # -------------------------- - - if st.button("Submit Ranking"): - - n = len(ordered_indices) - - for rank_position, idx in enumerate(ordered_indices): - - claim_obj = claims[idx] - score = 0 - if n == 1: - score = 1.0 - else: - score = 1 - (rank_position / (n - 1)) - - if (claim_obj["extra_info"] != ""): - if (claim_obj["extra_info"].find("PERFECT") != -1): - score = 1 - elif(claim_obj["extra_info"].find("DUPLICATE") != -1): - score = 0 - else: - score *= 0.5 - - - claim_obj["human_score"] = round(score, 3) + c["extra_info"] = temp.strip() + c["ranked"] = True + if st.button("Save Annotation"): save_data(INPUT_FILE, st.session_state.data) - save_data_clean(OUTPUT_FILE, copy.deepcopy(st.session_state.data)) - - - print("Ranking converted to scores and saved!") - st.session_state.current_claim = None - st.session_state.drag_order = None - 
+ print("Annotation saved") st.rerun() +# -------------------------- +# View/Rank +# -------------------------- + +elif view == "Rank Perfect Events": + + st.header("Rank PERFECT Events") + candidates = [] + + for entry in st.session_state.data: + perfect = [] + + for o in entry.get("output", []): + for c in o.get("content_parsed", []): + if "PERFECT" in c.get("extra_info", "") and not c.get("rank_position"): + perfect.append(c) + + if perfect: + candidates.append({"entry": entry, "claims": perfect}) + + if not candidates: + st.info("No PERFECT events available.") + st.stop() + + if "current_bundle" not in st.session_state: + st.session_state.current_bundle = random.choice(candidates) + + bundle = st.session_state.current_bundle + entry = bundle["entry"] + claims = bundle["claims"] + + st.subheader(entry.get("text")) + + # init + if "perfect_order" not in st.session_state: + st.session_state.perfect_order = list(range(len(claims))) + + order = st.session_state.perfect_order + + # labels shown in sortable UI + labels = [ + f"{i+1}. 
{claims[idx].get('event')}" + for i, idx in enumerate(order) + ] + + st.markdown("### Drag to reorder:") + + # ------------------------- + # Drag & drop UI + # ------------------------- + new_labels = sort_items(labels) + + # Convert reordered labels back → indices + if new_labels != labels: + new_order = [] + for lbl in new_labels: + original_pos = labels.index(lbl) + new_order.append(order[original_pos]) + + st.session_state.perfect_order = new_order + order = new_order + + st.markdown("---") + for rank, idx in enumerate(order): + c = claims[idx] + st.markdown(f"**Rank {rank+1}: {c.get('event')}**") + st.markdown(c.get("reasoningWhyRelevant")) + st.markdown("---") + + if st.button("Submit PERFECT Ranking"): + + n = len(order) + + for rank_position, idx in enumerate(order): + claim_obj = claims[idx] + + # explicit stored rank + claim_obj["rank_position"] = rank_position + 1 + + claim_obj["human_score"] = 1 + + # Auto-scoring + for entry in st.session_state.data: + for o in entry.get("output", []): + for c in o.get("content_parsed", []): + + if c.get("human_score") is not None: + continue + + extra = c.get("extra_info", "") + + if "DUPLICATE" in extra: + c["human_score"] = 0 + elif extra: + c["human_score"] = round( + c.get("score", 0) * 0.5, 3 + ) + + save_data(INPUT_FILE, st.session_state.data) + save_data_clean( + OUTPUT_FILE, + copy.deepcopy(st.session_state.data) + ) + + # reset state for next example + del st.session_state.current_bundle + del st.session_state.perfect_order + + print("Ranking saved!") + st.rerun() + +# -------------------------- +# View/Rules +# -------------------------- elif view == "View Rules": with open("rules.txt", "r", encoding="utf-8") as f: st.write(f.read()) +# -------------------------- +# View/Statistics +# -------------------------- elif view == "Statistics": - st.header("Statistics") word_counter = Counter() @@ -359,20 +349,9 @@ elif view == "Statistics": words = extra.strip().split() word_counter.update(words) - # ---- human 
score aggregation ---- - hs = c.get("human_score") - if hs is not None and doc_url: - doc_scores[doc_url].append(hs) - - # ---- diff score aggregation ---- - s = c.get("score") - if hs is not None and s is not None and doc_url: - diff = abs(hs - s) - diff_scores[doc_url].append(diff) - - # ========================== + # -------------------------- # Extra Info Word Counts - # ========================== + # -------------------------- st.subheader("Extra Info Label Counts") if word_counter: @@ -384,87 +363,4 @@ elif view == "Statistics": st.dataframe(df_words) st.bar_chart(df_words.set_index("Label")) else: - st.info("No extra_info data available yet.") - - # ========================== - # Avg Human Score per Document - # ========================== - st.subheader("Average Human Score per documentUrl") - - avg_scores = [] - - for doc, scores in doc_scores.items(): - if scores: - avg_scores.append({ - "documentUrl": doc, - "avg_human_score": sum(scores) / len(scores), - "num_events": len(scores) - }) - - if avg_scores: - df_scores = pd.DataFrame(avg_scores).sort_values( - "avg_human_score", - ascending=False - ) - - st.dataframe(df_scores) - # ========================== - # Distribution (rounded to 0.1) - # ========================== - - st.subheader("Distribution of Average Human Scores (Rounded to 0.1)") - - # round averages to nearest 0.1 - df_scores["rounded_score"] = ( - df_scores["avg_human_score"].round(1) - ) - - # count how many docs fall into each bucket - dist = ( - df_scores["rounded_score"] - .value_counts() - .sort_index() - .reset_index() - ) - - dist.columns = ["rounded_score", "count"] - - # ensure all bins from 0.0 → 1.0 exist - all_bins = pd.DataFrame({ - "rounded_score": [round(x * 0.1, 1) for x in range(11)] - }) - - dist = ( - all_bins.merge(dist, on="rounded_score", how="left") - .fillna(0) - ) - - dist["count"] = dist["count"].astype(int) - - # plot counts per score bucket - st.bar_chart( - dist.set_index("rounded_score")["count"] - ) - else: - 
st.info("No human scores available yet.") - - # ========================== - # Overall Model vs Human Difference - # ========================== - st.subheader("Model vs Human Agreement") - - all_diffs = [ - diff - for diffs in diff_scores.values() - for diff in diffs - ] - - if all_diffs: - avg_diff = sum(all_diffs) / len(all_diffs) - - st.write( - f"Average absolute difference between model score and human score: " - f"**{avg_diff:.3f}**" - ) - else: - st.info("No items have both score and human_score yet.") \ No newline at end of file + st.info("No extra_info data available yet.") \ No newline at end of file diff --git a/supporting/scorer/requirements.txt b/supporting/scorer/requirements.txt index e251330..6c96522 100644 --- a/supporting/scorer/requirements.txt +++ b/supporting/scorer/requirements.txt @@ -1 +1,2 @@ -streamlit \ No newline at end of file +streamlit +streamlit-sortables \ No newline at end of file diff --git a/supporting/scorer/rules.txt b/supporting/scorer/rules.txt index 3f3aca8..0291a22 100644 --- a/supporting/scorer/rules.txt +++ b/supporting/scorer/rules.txt @@ -14,4 +14,7 @@ 5. Proposed trigger events should be sufficiently different from one another -6. Proposed trigger events must be free from bias, and backed up by reliable evidence \ No newline at end of file +6. Proposed trigger events must be free from bias, and backed up by reliable evidence + +Edge case handling: +In the event that an analysis is otherwise perfect but contains section(s) that violate one of the above rules, a worst-case approach should be taken and the analysis should be labelled negatively \ No newline at end of file