Update scoring to work in two passes over the data to save time. Add a section on edge-case handling to the rules.

This commit is contained in:
William Jeynes
2026-02-26 10:09:36 +00:00
parent 8317fd85df
commit 6c3aa7343d
3 changed files with 167 additions and 267 deletions
+146 -250
View File
@@ -5,17 +5,15 @@ import random
from pathlib import Path from pathlib import Path
from collections import Counter, defaultdict from collections import Counter, defaultdict
import pandas as pd import pandas as pd
from streamlit_sortables import sort_items
# Path to your JSONL file
INPUT_FILE = "../../data/results.jsonl" INPUT_FILE = "../../data/results.jsonl"
OUTPUT_FILE = "../../data/ranked.jsonl" OUTPUT_FILE = "../../data/ranked.jsonl"
# -------------------------- # --------------------------
# Helper functions # Helper functions
# -------------------------- # --------------------------
def load_data(file_path): def load_data(file_path):
"""Load JSONL file into a list of dicts with parsed content."""
data = [] data = []
if Path(file_path).exists(): if Path(file_path).exists():
@@ -25,29 +23,20 @@ def load_data(file_path):
continue continue
entry = json.loads(line) entry = json.loads(line)
outputs = entry.get("output", []) outputs = entry.get("output", [])
# ---- normalize format ----
# old format: list
# new format: single dict
if isinstance(outputs, dict): if isinstance(outputs, dict):
outputs = [outputs] outputs = [outputs]
# ---- parse content ----
for o in outputs: for o in outputs:
content = o.get("content") content = o.get("content")
if content: if content:
try: try:
o["content_parsed"] = json.loads(content) o["content_parsed"] = json.loads(content)
except json.JSONDecodeError: except json.JSONDecodeError:
o["content_parsed"] = [] o["content_parsed"] = []
print("parse error")
# optionally store normalized outputs back
entry["output"] = outputs entry["output"] = outputs
data.append(entry) data.append(entry)
return data return data
@@ -57,7 +46,6 @@ def save_data_clean(file_path, data):
merged = {} merged = {}
for entry in data: for entry in data:
# collect all content_parsed items from this entry
events = [] events = []
for o in entry.get("output", []): for o in entry.get("output", []):
if "content_parsed" in o: if "content_parsed" in o:
@@ -68,31 +56,25 @@ def save_data_clean(file_path, data):
continue continue
if doc_url not in merged: if doc_url not in merged:
# take the first object's other values
new_entry = entry.copy() new_entry = entry.copy()
new_entry["events"] = events new_entry["events"] = events
# remove unwanted fields safely
new_entry.pop("output", None) new_entry.pop("output", None)
new_entry.pop("status", None) new_entry.pop("status", None)
merged[doc_url] = new_entry merged[doc_url] = new_entry
else: else:
# merge events into existing entry
merged[doc_url]["events"].extend(events) merged[doc_url]["events"].extend(events)
# sort events by human_score
for entry in merged.values(): for entry in merged.values():
entry["events"].sort( entry["events"].sort(
key=lambda e: e.get("human_score", 0), key=lambda e: e.get("human_score", 0),
reverse=True # highest score first; remove if you want ascending reverse=True
) )
# write merged results
with open(file_path, "w", encoding="utf-8") as f: with open(file_path, "w", encoding="utf-8") as f:
for entry in merged.values(): for entry in merged.values():
f.write(json.dumps(entry, ensure_ascii=False) + "\n") f.write(json.dumps(entry, ensure_ascii=False) + "\n")
def save_data(file_path, data): def save_data(file_path, data):
with open(file_path, "w", encoding="utf-8") as f: with open(file_path, "w", encoding="utf-8") as f:
for entry in data: for entry in data:
@@ -129,11 +111,17 @@ st.title("Claim Visualizer")
# -------------------------- # --------------------------
view = st.sidebar.selectbox( view = st.sidebar.selectbox(
"Choose View", "Choose View",
["All Claims", "Single Claim Random", "View Rules", "Statistics"] [
"All Claims",
"Single Claim Random",
"Rank Perfect Events",
"View Rules",
"Statistics"
]
) )
# -------------------------- # --------------------------
# ALL CLAIMS VIEW # View/AllClaims
# -------------------------- # --------------------------
if view == "All Claims": if view == "All Claims":
st.header("All Claims") st.header("All Claims")
@@ -150,51 +138,33 @@ if view == "All Claims":
st.markdown("---") st.markdown("---")
# -------------------------- # --------------------------
# SINGLE CLAIM RANDOM VIEW # View/Annotate
# -------------------------- # --------------------------
elif view == "Single Claim Random": elif view == "Single Claim Random":
# Select new entry if needed
if st.session_state.current_claim is None: if st.session_state.current_claim is None:
unscored_entries = [] unannotated = []
for entry in st.session_state.data: for entry in st.session_state.data:
unscored = [] claims = []
for o in entry.get("output", []): for o in entry.get("output", []):
for c in o.get("content_parsed", []): for c in o.get("content_parsed", []):
if c.get("human_score") is None: if not c.get("ranked"):
unscored.append(c) claims.append(c)
if unscored: if claims:
# try to find an existing entry with same documentUrl unannotated.append({"entry": entry, "claims": claims})
existing = next(
(item for item in unscored_entries
if item["entry"]["documentUrl"] == entry["documentUrl"]),
None
)
if existing: if unannotated:
# append new claims to existing entry st.session_state.current_claim = random.choice(unannotated)
existing["claims"].extend(unscored)
else:
# create new object
unscored_entries.append({
"entry": entry,
"claims": list(unscored)
})
if unscored_entries:
st.session_state.current_claim = random.choice(unscored_entries)
st.session_state.drag_order = None st.session_state.drag_order = None
else:
st.session_state.current_claim = None
bundle = st.session_state.current_claim bundle = st.session_state.current_claim
if bundle is None: if bundle is None:
st.info("No entries remaining without human scores.") st.info("All items annotated.")
else: else:
entry = bundle["entry"] entry = bundle["entry"]
claims = bundle["claims"] claims = bundle["claims"]
@@ -202,144 +172,164 @@ elif view == "Single Claim Random":
st.subheader(entry.get("text")) st.subheader(entry.get("text"))
st.write(entry.get("normalized", "")) st.write(entry.get("normalized", ""))
# --------------------------
# Stable Drag IDs (FIX)
# --------------------------
claim_ids = [str(i) for i in range(len(claims))]
# Initialize order only once
if (
st.session_state.drag_order is None
or len(st.session_state.drag_order) != len(claim_ids)
):
st.session_state.drag_order = claim_ids.copy()
ordered_indices = [
int(i) for i in st.session_state.drag_order
]
# --------------------------
# Annotation Section
# --------------------------
st.subheader("Annotate Events") st.subheader("Annotate Events")
for pos, idx in enumerate(ordered_indices): for idx, c in enumerate(claims):
c = claims[idx]
with st.container(border=True): with st.container(border=True):
st.markdown(f"**Event:** {c.get('event')}") st.markdown(f"**Event:** {c.get('event')}")
st.markdown( st.markdown(f"**Reasoning:** {c.get('reasoningWhyRelevant')}")
f"**Reasoning:** {c.get('reasoningWhyRelevant')}"
)
cols = st.columns(7) cols = st.columns(7)
temp = "" temp = ""
with cols[0]: labels = [
a = st.checkbox("Rewording", key = "R" + str(idx) + c.get('event') ) ("Rewording", "REWORDING"),
temp += "REWORDING " if a else "" ("Not Specific", "NSPECIFIC"),
("Time Incorrect", "TINCORRECT"),
("Story?", "STORY"),
("Duplicate?", "DUPLICATE"),
("Bias Shown", "BIAS"),
("Perfect", "PERFECT"),
]
with cols[1]: for i, (name, tag) in enumerate(labels):
a = st.checkbox("Not Specific", key = "S" + str(idx) + c.get('event') ) with cols[i]:
temp += "NSPECIFIC " if a else "" if st.checkbox(name, key=f"{tag}{idx}{c.get('event')}"):
temp += tag + " "
with cols[2]: c["extra_info"] = temp.strip()
a = st.checkbox("Time Incorrect", key = "T" + str(idx) + c.get('event') ) c["ranked"] = True
temp += "TINCORRECT " if a else ""
with cols[3]: if st.button("Save Annotation"):
a = st.checkbox("Story?", key = "Y" + str(idx) + c.get('event') ) save_data(INPUT_FILE, st.session_state.data)
temp += "STORY " if a else "" st.session_state.current_claim = None
print("Annotation saved")
with cols[4]:
a = st.checkbox("Duplicate?", key = "D" + str(idx) + c.get('event') )
temp += "DUPLICATE " if a else ""
with cols[5]:
a = st.checkbox("Bias Shown", key = "B" + str(idx) + c.get('event') )
temp += "BIAS " if a else ""
with cols[6]:
a = st.checkbox("Perfect", key = "P" + str(idx) + c.get('event') )
temp += "PERFECT " if a else ""
c["extra_info"] = temp
# ---- MOVE BUTTONS ----
move_cols = st.columns(2)
with move_cols[0]:
if st.button(
"Up",
key="UP" + str(idx) + c.get("event")
):
if pos > 0:
order = st.session_state.drag_order
order[pos], order[pos - 1] = order[pos - 1], order[pos]
st.session_state.drag_order = order
st.rerun() st.rerun()
with move_cols[1]:
if st.button(
"Down",
key="DOWN" + str(idx) + c.get("event")
):
if pos < len(st.session_state.drag_order) - 1:
order = st.session_state.drag_order
order[pos], order[pos + 1] = order[pos + 1], order[pos]
st.session_state.drag_order = order
st.rerun()
# -------------------------- # --------------------------
# Submit Ranking # View/Rank
# -------------------------- # --------------------------
if st.button("Submit Ranking"): elif view == "Rank Perfect Events":
n = len(ordered_indices) st.header("Rank PERFECT Events")
candidates = []
for rank_position, idx in enumerate(ordered_indices): for entry in st.session_state.data:
perfect = []
for o in entry.get("output", []):
for c in o.get("content_parsed", []):
if "PERFECT" in c.get("extra_info", "") and not c.get("rank_position"):
perfect.append(c)
if perfect:
candidates.append({"entry": entry, "claims": perfect})
if not candidates:
st.info("No PERFECT events available.")
st.stop()
if "current_bundle" not in st.session_state:
st.session_state.current_bundle = random.choice(candidates)
bundle = st.session_state.current_bundle
entry = bundle["entry"]
claims = bundle["claims"]
st.subheader(entry.get("text"))
# init
if "perfect_order" not in st.session_state:
st.session_state.perfect_order = list(range(len(claims)))
order = st.session_state.perfect_order
# labels shown in sortable UI
labels = [
f"{i+1}. {claims[idx].get('event')}"
for i, idx in enumerate(order)
]
st.markdown("### Drag to reorder:")
# -------------------------
# Drag & drop UI
# -------------------------
new_labels = sort_items(labels)
# Convert reordered labels back → indices
if new_labels != labels:
new_order = []
for lbl in new_labels:
original_pos = labels.index(lbl)
new_order.append(order[original_pos])
st.session_state.perfect_order = new_order
order = new_order
st.markdown("---")
for rank, idx in enumerate(order):
c = claims[idx]
st.markdown(f"**Rank {rank+1}: {c.get('event')}**")
st.markdown(c.get("reasoningWhyRelevant"))
st.markdown("---")
if st.button("Submit PERFECT Ranking"):
n = len(order)
for rank_position, idx in enumerate(order):
claim_obj = claims[idx] claim_obj = claims[idx]
score = 0
if n == 1:
score = 1.0
else:
score = 1 - (rank_position / (n - 1))
if (claim_obj["extra_info"] != ""): # explicit stored rank
if (claim_obj["extra_info"].find("PERFECT") != -1): claim_obj["rank_position"] = rank_position + 1
score = 1
elif(claim_obj["extra_info"].find("DUPLICATE") != -1):
score = 0
else:
score *= 0.5
claim_obj["human_score"] = 1
claim_obj["human_score"] = round(score, 3) # Auto-scoring
for entry in st.session_state.data:
for o in entry.get("output", []):
for c in o.get("content_parsed", []):
if c.get("human_score") is not None:
continue
extra = c.get("extra_info", "")
if "DUPLICATE" in extra:
c["human_score"] = 0
elif extra:
c["human_score"] = round(
c.get("score", 0) * 0.5, 3
)
save_data(INPUT_FILE, st.session_state.data) save_data(INPUT_FILE, st.session_state.data)
save_data_clean(OUTPUT_FILE, copy.deepcopy(st.session_state.data)) save_data_clean(
OUTPUT_FILE,
copy.deepcopy(st.session_state.data)
)
# reset state for next example
del st.session_state.current_bundle
del st.session_state.perfect_order
print("Ranking converted to scores and saved!") print("Ranking saved!")
st.session_state.current_claim = None
st.session_state.drag_order = None
st.rerun() st.rerun()
# --------------------------
# View/Rules
# --------------------------
elif view == "View Rules": elif view == "View Rules":
with open("rules.txt", "r", encoding="utf-8") as f: with open("rules.txt", "r", encoding="utf-8") as f:
st.write(f.read()) st.write(f.read())
# --------------------------
# View/Statistics
# --------------------------
elif view == "Statistics": elif view == "Statistics":
st.header("Statistics") st.header("Statistics")
word_counter = Counter() word_counter = Counter()
@@ -359,20 +349,9 @@ elif view == "Statistics":
words = extra.strip().split() words = extra.strip().split()
word_counter.update(words) word_counter.update(words)
# ---- human score aggregation ---- # --------------------------
hs = c.get("human_score")
if hs is not None and doc_url:
doc_scores[doc_url].append(hs)
# ---- diff score aggregation ----
s = c.get("score")
if hs is not None and s is not None and doc_url:
diff = abs(hs - s)
diff_scores[doc_url].append(diff)
# ==========================
# Extra Info Word Counts # Extra Info Word Counts
# ========================== # --------------------------
st.subheader("Extra Info Label Counts") st.subheader("Extra Info Label Counts")
if word_counter: if word_counter:
@@ -385,86 +364,3 @@ elif view == "Statistics":
st.bar_chart(df_words.set_index("Label")) st.bar_chart(df_words.set_index("Label"))
else: else:
st.info("No extra_info data available yet.") st.info("No extra_info data available yet.")
# ==========================
# Avg Human Score per Document
# ==========================
st.subheader("Average Human Score per documentUrl")
avg_scores = []
for doc, scores in doc_scores.items():
if scores:
avg_scores.append({
"documentUrl": doc,
"avg_human_score": sum(scores) / len(scores),
"num_events": len(scores)
})
if avg_scores:
df_scores = pd.DataFrame(avg_scores).sort_values(
"avg_human_score",
ascending=False
)
st.dataframe(df_scores)
# ==========================
# Distribution (rounded to 0.1)
# ==========================
st.subheader("Distribution of Average Human Scores (Rounded to 0.1)")
# round averages to nearest 0.1
df_scores["rounded_score"] = (
df_scores["avg_human_score"].round(1)
)
# count how many docs fall into each bucket
dist = (
df_scores["rounded_score"]
.value_counts()
.sort_index()
.reset_index()
)
dist.columns = ["rounded_score", "count"]
# ensure all bins from 0.0 → 1.0 exist
all_bins = pd.DataFrame({
"rounded_score": [round(x * 0.1, 1) for x in range(11)]
})
dist = (
all_bins.merge(dist, on="rounded_score", how="left")
.fillna(0)
)
dist["count"] = dist["count"].astype(int)
# plot counts per score bucket
st.bar_chart(
dist.set_index("rounded_score")["count"]
)
else:
st.info("No human scores available yet.")
# ==========================
# Overall Model vs Human Difference
# ==========================
st.subheader("Model vs Human Agreement")
all_diffs = [
diff
for diffs in diff_scores.values()
for diff in diffs
]
if all_diffs:
avg_diff = sum(all_diffs) / len(all_diffs)
st.write(
f"Average absolute difference between model score and human score: "
f"**{avg_diff:.3f}**"
)
else:
st.info("No items have both score and human_score yet.")
+1
View File
@@ -1 +1,2 @@
streamlit streamlit
streamlit-sortables
+3
View File
@@ -15,3 +15,6 @@
5. Proposed trigger events should be sufficiently different from one another 5. Proposed trigger events should be sufficiently different from one another
6. Proposed trigger events must be free from bias, and backed up by reliable evidence 6. Proposed trigger events must be free from bias, and backed up by reliable evidence
Edge case handling:
In the event that an analysis is otherwise perfect but contains section(s) that violate one of the rules above, a worst-case approach should be taken and the analysis should be labelled negatively.