Update scoring to work in two passes over the data to save time. Add a section on edge-case handling to the rules.

This commit is contained in:
William Jeynes
2026-02-26 10:09:36 +00:00
parent 8317fd85df
commit 6c3aa7343d
3 changed files with 167 additions and 267 deletions
+161 -265
View File
@@ -5,17 +5,15 @@ import random
from pathlib import Path
from collections import Counter, defaultdict
import pandas as pd
from streamlit_sortables import sort_items
# Path to your JSONL file
INPUT_FILE = "../../data/results.jsonl"
OUTPUT_FILE = "../../data/ranked.jsonl"
# --------------------------
# Helper functions
# --------------------------
def load_data(file_path):
"""Load JSONL file into a list of dicts with parsed content."""
data = []
if Path(file_path).exists():
@@ -25,29 +23,20 @@ def load_data(file_path):
continue
entry = json.loads(line)
outputs = entry.get("output", [])
# ---- normalize format ----
# old format: list
# new format: single dict
if isinstance(outputs, dict):
outputs = [outputs]
# ---- parse content ----
for o in outputs:
content = o.get("content")
if content:
try:
o["content_parsed"] = json.loads(content)
except json.JSONDecodeError:
o["content_parsed"] = []
print("parse error")
# optionally store normalized outputs back
entry["output"] = outputs
data.append(entry)
return data
@@ -57,7 +46,6 @@ def save_data_clean(file_path, data):
merged = {}
for entry in data:
# collect all content_parsed items from this entry
events = []
for o in entry.get("output", []):
if "content_parsed" in o:
@@ -68,31 +56,25 @@ def save_data_clean(file_path, data):
continue
if doc_url not in merged:
# take the first object's other values
new_entry = entry.copy()
new_entry["events"] = events
# remove unwanted fields safely
new_entry.pop("output", None)
new_entry.pop("status", None)
merged[doc_url] = new_entry
else:
# merge events into existing entry
merged[doc_url]["events"].extend(events)
# sort events by human_score
for entry in merged.values():
entry["events"].sort(
key=lambda e: e.get("human_score", 0),
reverse=True # highest score first; remove if you want ascending
reverse=True
)
# write merged results
with open(file_path, "w", encoding="utf-8") as f:
for entry in merged.values():
f.write(json.dumps(entry, ensure_ascii=False) + "\n")
def save_data(file_path, data):
with open(file_path, "w", encoding="utf-8") as f:
for entry in data:
@@ -129,11 +111,17 @@ st.title("Claim Visualizer")
# --------------------------
view = st.sidebar.selectbox(
"Choose View",
["All Claims", "Single Claim Random", "View Rules", "Statistics"]
[
"All Claims",
"Single Claim Random",
"Rank Perfect Events",
"View Rules",
"Statistics"
]
)
# --------------------------
# ALL CLAIMS VIEW
# View/AllClaims
# --------------------------
if view == "All Claims":
st.header("All Claims")
@@ -150,51 +138,33 @@ if view == "All Claims":
st.markdown("---")
# --------------------------
# SINGLE CLAIM RANDOM VIEW
# View/Annotate
# --------------------------
elif view == "Single Claim Random":
# Select new entry if needed
if st.session_state.current_claim is None:
unscored_entries = []
unannotated = []
for entry in st.session_state.data:
unscored = []
claims = []
for o in entry.get("output", []):
for c in o.get("content_parsed", []):
if c.get("human_score") is None:
unscored.append(c)
if not c.get("ranked"):
claims.append(c)
if unscored:
# try to find an existing entry with same documentUrl
existing = next(
(item for item in unscored_entries
if item["entry"]["documentUrl"] == entry["documentUrl"]),
None
)
if claims:
unannotated.append({"entry": entry, "claims": claims})
if existing:
# append new claims to existing entry
existing["claims"].extend(unscored)
else:
# create new object
unscored_entries.append({
"entry": entry,
"claims": list(unscored)
})
if unscored_entries:
st.session_state.current_claim = random.choice(unscored_entries)
if unannotated:
st.session_state.current_claim = random.choice(unannotated)
st.session_state.drag_order = None
else:
st.session_state.current_claim = None
bundle = st.session_state.current_claim
if bundle is None:
st.info("No entries remaining without human scores.")
st.info("All items annotated.")
else:
entry = bundle["entry"]
claims = bundle["claims"]
@@ -202,144 +172,164 @@ elif view == "Single Claim Random":
st.subheader(entry.get("text"))
st.write(entry.get("normalized", ""))
# --------------------------
# Stable Drag IDs (FIX)
# --------------------------
claim_ids = [str(i) for i in range(len(claims))]
# Initialize order only once
if (
st.session_state.drag_order is None
or len(st.session_state.drag_order) != len(claim_ids)
):
st.session_state.drag_order = claim_ids.copy()
ordered_indices = [
int(i) for i in st.session_state.drag_order
]
# --------------------------
# Annotation Section
# --------------------------
st.subheader("Annotate Events")
for pos, idx in enumerate(ordered_indices):
c = claims[idx]
for idx, c in enumerate(claims):
with st.container(border=True):
st.markdown(f"**Event:** {c.get('event')}")
st.markdown(
f"**Reasoning:** {c.get('reasoningWhyRelevant')}"
)
st.markdown(f"**Reasoning:** {c.get('reasoningWhyRelevant')}")
cols = st.columns(7)
temp = ""
with cols[0]:
a = st.checkbox("Rewording", key = "R" + str(idx) + c.get('event') )
temp += "REWORDING " if a else ""
labels = [
("Rewording", "REWORDING"),
("Not Specific", "NSPECIFIC"),
("Time Incorrect", "TINCORRECT"),
("Story?", "STORY"),
("Duplicate?", "DUPLICATE"),
("Bias Shown", "BIAS"),
("Perfect", "PERFECT"),
]
with cols[1]:
a = st.checkbox("Not Specific", key = "S" + str(idx) + c.get('event') )
temp += "NSPECIFIC " if a else ""
for i, (name, tag) in enumerate(labels):
with cols[i]:
if st.checkbox(name, key=f"{tag}{idx}{c.get('event')}"):
temp += tag + " "
with cols[2]:
a = st.checkbox("Time Incorrect", key = "T" + str(idx) + c.get('event') )
temp += "TINCORRECT " if a else ""
with cols[3]:
a = st.checkbox("Story?", key = "Y" + str(idx) + c.get('event') )
temp += "STORY " if a else ""
with cols[4]:
a = st.checkbox("Duplicate?", key = "D" + str(idx) + c.get('event') )
temp += "DUPLICATE " if a else ""
with cols[5]:
a = st.checkbox("Bias Shown", key = "B" + str(idx) + c.get('event') )
temp += "BIAS " if a else ""
with cols[6]:
a = st.checkbox("Perfect", key = "P" + str(idx) + c.get('event') )
temp += "PERFECT " if a else ""
c["extra_info"] = temp
# ---- MOVE BUTTONS ----
move_cols = st.columns(2)
with move_cols[0]:
if st.button(
"Up",
key="UP" + str(idx) + c.get("event")
):
if pos > 0:
order = st.session_state.drag_order
order[pos], order[pos - 1] = order[pos - 1], order[pos]
st.session_state.drag_order = order
st.rerun()
with move_cols[1]:
if st.button(
"Down",
key="DOWN" + str(idx) + c.get("event")
):
if pos < len(st.session_state.drag_order) - 1:
order = st.session_state.drag_order
order[pos], order[pos + 1] = order[pos + 1], order[pos]
st.session_state.drag_order = order
st.rerun()
# --------------------------
# Submit Ranking
# --------------------------
if st.button("Submit Ranking"):
n = len(ordered_indices)
for rank_position, idx in enumerate(ordered_indices):
claim_obj = claims[idx]
score = 0
if n == 1:
score = 1.0
else:
score = 1 - (rank_position / (n - 1))
if (claim_obj["extra_info"] != ""):
if (claim_obj["extra_info"].find("PERFECT") != -1):
score = 1
elif(claim_obj["extra_info"].find("DUPLICATE") != -1):
score = 0
else:
score *= 0.5
claim_obj["human_score"] = round(score, 3)
c["extra_info"] = temp.strip()
c["ranked"] = True
if st.button("Save Annotation"):
save_data(INPUT_FILE, st.session_state.data)
save_data_clean(OUTPUT_FILE, copy.deepcopy(st.session_state.data))
print("Ranking converted to scores and saved!")
st.session_state.current_claim = None
st.session_state.drag_order = None
print("Annotation saved")
st.rerun()
# --------------------------
# View/Rank
# --------------------------
elif view == "Rank Perfect Events":
st.header("Rank PERFECT Events")
candidates = []
for entry in st.session_state.data:
perfect = []
for o in entry.get("output", []):
for c in o.get("content_parsed", []):
if "PERFECT" in c.get("extra_info", "") and not c.get("rank_position"):
perfect.append(c)
if perfect:
candidates.append({"entry": entry, "claims": perfect})
if not candidates:
st.info("No PERFECT events available.")
st.stop()
if "current_bundle" not in st.session_state:
st.session_state.current_bundle = random.choice(candidates)
bundle = st.session_state.current_bundle
entry = bundle["entry"]
claims = bundle["claims"]
st.subheader(entry.get("text"))
# init
if "perfect_order" not in st.session_state:
st.session_state.perfect_order = list(range(len(claims)))
order = st.session_state.perfect_order
# labels shown in sortable UI
labels = [
f"{i+1}. {claims[idx].get('event')}"
for i, idx in enumerate(order)
]
st.markdown("### Drag to reorder:")
# -------------------------
# Drag & drop UI
# -------------------------
new_labels = sort_items(labels)
# Convert reordered labels back → indices
if new_labels != labels:
new_order = []
for lbl in new_labels:
original_pos = labels.index(lbl)
new_order.append(order[original_pos])
st.session_state.perfect_order = new_order
order = new_order
st.markdown("---")
for rank, idx in enumerate(order):
c = claims[idx]
st.markdown(f"**Rank {rank+1}: {c.get('event')}**")
st.markdown(c.get("reasoningWhyRelevant"))
st.markdown("---")
if st.button("Submit PERFECT Ranking"):
n = len(order)
for rank_position, idx in enumerate(order):
claim_obj = claims[idx]
# explicit stored rank
claim_obj["rank_position"] = rank_position + 1
claim_obj["human_score"] = 1
# Auto-scoring
for entry in st.session_state.data:
for o in entry.get("output", []):
for c in o.get("content_parsed", []):
if c.get("human_score") is not None:
continue
extra = c.get("extra_info", "")
if "DUPLICATE" in extra:
c["human_score"] = 0
elif extra:
c["human_score"] = round(
c.get("score", 0) * 0.5, 3
)
save_data(INPUT_FILE, st.session_state.data)
save_data_clean(
OUTPUT_FILE,
copy.deepcopy(st.session_state.data)
)
# reset state for next example
del st.session_state.current_bundle
del st.session_state.perfect_order
print("Ranking saved!")
st.rerun()
# --------------------------
# View/Rules
# --------------------------
elif view == "View Rules":
with open("rules.txt", "r", encoding="utf-8") as f:
st.write(f.read())
# --------------------------
# View/Statistics
# --------------------------
elif view == "Statistics":
st.header("Statistics")
word_counter = Counter()
@@ -359,20 +349,9 @@ elif view == "Statistics":
words = extra.strip().split()
word_counter.update(words)
# ---- human score aggregation ----
hs = c.get("human_score")
if hs is not None and doc_url:
doc_scores[doc_url].append(hs)
# ---- diff score aggregation ----
s = c.get("score")
if hs is not None and s is not None and doc_url:
diff = abs(hs - s)
diff_scores[doc_url].append(diff)
# ==========================
# --------------------------
# Extra Info Word Counts
# ==========================
# --------------------------
st.subheader("Extra Info Label Counts")
if word_counter:
@@ -384,87 +363,4 @@ elif view == "Statistics":
st.dataframe(df_words)
st.bar_chart(df_words.set_index("Label"))
else:
st.info("No extra_info data available yet.")
# ==========================
# Avg Human Score per Document
# ==========================
st.subheader("Average Human Score per documentUrl")
avg_scores = []
for doc, scores in doc_scores.items():
if scores:
avg_scores.append({
"documentUrl": doc,
"avg_human_score": sum(scores) / len(scores),
"num_events": len(scores)
})
if avg_scores:
df_scores = pd.DataFrame(avg_scores).sort_values(
"avg_human_score",
ascending=False
)
st.dataframe(df_scores)
# ==========================
# Distribution (rounded to 0.1)
# ==========================
st.subheader("Distribution of Average Human Scores (Rounded to 0.1)")
# round averages to nearest 0.1
df_scores["rounded_score"] = (
df_scores["avg_human_score"].round(1)
)
# count how many docs fall into each bucket
dist = (
df_scores["rounded_score"]
.value_counts()
.sort_index()
.reset_index()
)
dist.columns = ["rounded_score", "count"]
# ensure all bins from 0.0 → 1.0 exist
all_bins = pd.DataFrame({
"rounded_score": [round(x * 0.1, 1) for x in range(11)]
})
dist = (
all_bins.merge(dist, on="rounded_score", how="left")
.fillna(0)
)
dist["count"] = dist["count"].astype(int)
# plot counts per score bucket
st.bar_chart(
dist.set_index("rounded_score")["count"]
)
else:
st.info("No human scores available yet.")
# ==========================
# Overall Model vs Human Difference
# ==========================
st.subheader("Model vs Human Agreement")
all_diffs = [
diff
for diffs in diff_scores.values()
for diff in diffs
]
if all_diffs:
avg_diff = sum(all_diffs) / len(all_diffs)
st.write(
f"Average absolute difference between model score and human score: "
f"**{avg_diff:.3f}**"
)
else:
st.info("No items have both score and human_score yet.")
st.info("No extra_info data available yet.")
+2 -1
View File
@@ -1 +1,2 @@
streamlit
streamlit
streamlit-sortables
+4 -1
View File
@@ -14,4 +14,7 @@
5. Proposed trigger events should be sufficiently different from one another
6. Proposed trigger events must be free from bias, and backed up by reliable evidence
6. Proposed trigger events must be free from bias, and backed up by reliable evidence
Edge case handling:
In the event that an analysis is otherwise perfect but contains section(s) that violate one of the above rules, a worst-case approach should be taken and the analysis should be labelled negatively.