Update scoring to work in two passes over the data to save time. Add a section on edge-case handling to the rules.
This commit is contained in:
+146
-250
@@ -5,17 +5,15 @@ import random
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from collections import Counter, defaultdict
|
from collections import Counter, defaultdict
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
from streamlit_sortables import sort_items
|
||||||
|
|
||||||
# Path to your JSONL file
|
|
||||||
INPUT_FILE = "../../data/results.jsonl"
|
INPUT_FILE = "../../data/results.jsonl"
|
||||||
OUTPUT_FILE = "../../data/ranked.jsonl"
|
OUTPUT_FILE = "../../data/ranked.jsonl"
|
||||||
|
|
||||||
# --------------------------
|
# --------------------------
|
||||||
# Helper functions
|
# Helper functions
|
||||||
# --------------------------
|
# --------------------------
|
||||||
|
|
||||||
def load_data(file_path):
|
def load_data(file_path):
|
||||||
"""Load JSONL file into a list of dicts with parsed content."""
|
|
||||||
data = []
|
data = []
|
||||||
|
|
||||||
if Path(file_path).exists():
|
if Path(file_path).exists():
|
||||||
@@ -25,29 +23,20 @@ def load_data(file_path):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
entry = json.loads(line)
|
entry = json.loads(line)
|
||||||
|
|
||||||
outputs = entry.get("output", [])
|
outputs = entry.get("output", [])
|
||||||
|
|
||||||
# ---- normalize format ----
|
|
||||||
# old format: list
|
|
||||||
# new format: single dict
|
|
||||||
if isinstance(outputs, dict):
|
if isinstance(outputs, dict):
|
||||||
outputs = [outputs]
|
outputs = [outputs]
|
||||||
|
|
||||||
# ---- parse content ----
|
|
||||||
for o in outputs:
|
for o in outputs:
|
||||||
content = o.get("content")
|
content = o.get("content")
|
||||||
|
|
||||||
if content:
|
if content:
|
||||||
try:
|
try:
|
||||||
o["content_parsed"] = json.loads(content)
|
o["content_parsed"] = json.loads(content)
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
o["content_parsed"] = []
|
o["content_parsed"] = []
|
||||||
print("parse error")
|
|
||||||
|
|
||||||
# optionally store normalized outputs back
|
|
||||||
entry["output"] = outputs
|
entry["output"] = outputs
|
||||||
|
|
||||||
data.append(entry)
|
data.append(entry)
|
||||||
|
|
||||||
return data
|
return data
|
||||||
@@ -57,7 +46,6 @@ def save_data_clean(file_path, data):
|
|||||||
merged = {}
|
merged = {}
|
||||||
|
|
||||||
for entry in data:
|
for entry in data:
|
||||||
# collect all content_parsed items from this entry
|
|
||||||
events = []
|
events = []
|
||||||
for o in entry.get("output", []):
|
for o in entry.get("output", []):
|
||||||
if "content_parsed" in o:
|
if "content_parsed" in o:
|
||||||
@@ -68,31 +56,25 @@ def save_data_clean(file_path, data):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
if doc_url not in merged:
|
if doc_url not in merged:
|
||||||
# take the first object's other values
|
|
||||||
new_entry = entry.copy()
|
new_entry = entry.copy()
|
||||||
new_entry["events"] = events
|
new_entry["events"] = events
|
||||||
|
|
||||||
# remove unwanted fields safely
|
|
||||||
new_entry.pop("output", None)
|
new_entry.pop("output", None)
|
||||||
new_entry.pop("status", None)
|
new_entry.pop("status", None)
|
||||||
|
|
||||||
merged[doc_url] = new_entry
|
merged[doc_url] = new_entry
|
||||||
else:
|
else:
|
||||||
# merge events into existing entry
|
|
||||||
merged[doc_url]["events"].extend(events)
|
merged[doc_url]["events"].extend(events)
|
||||||
|
|
||||||
# sort events by human_score
|
|
||||||
for entry in merged.values():
|
for entry in merged.values():
|
||||||
entry["events"].sort(
|
entry["events"].sort(
|
||||||
key=lambda e: e.get("human_score", 0),
|
key=lambda e: e.get("human_score", 0),
|
||||||
reverse=True # highest score first; remove if you want ascending
|
reverse=True
|
||||||
)
|
)
|
||||||
|
|
||||||
# write merged results
|
|
||||||
with open(file_path, "w", encoding="utf-8") as f:
|
with open(file_path, "w", encoding="utf-8") as f:
|
||||||
for entry in merged.values():
|
for entry in merged.values():
|
||||||
f.write(json.dumps(entry, ensure_ascii=False) + "\n")
|
f.write(json.dumps(entry, ensure_ascii=False) + "\n")
|
||||||
|
|
||||||
|
|
||||||
def save_data(file_path, data):
|
def save_data(file_path, data):
|
||||||
with open(file_path, "w", encoding="utf-8") as f:
|
with open(file_path, "w", encoding="utf-8") as f:
|
||||||
for entry in data:
|
for entry in data:
|
||||||
@@ -129,11 +111,17 @@ st.title("Claim Visualizer")
|
|||||||
# --------------------------
|
# --------------------------
|
||||||
view = st.sidebar.selectbox(
|
view = st.sidebar.selectbox(
|
||||||
"Choose View",
|
"Choose View",
|
||||||
["All Claims", "Single Claim Random", "View Rules", "Statistics"]
|
[
|
||||||
|
"All Claims",
|
||||||
|
"Single Claim Random",
|
||||||
|
"Rank Perfect Events",
|
||||||
|
"View Rules",
|
||||||
|
"Statistics"
|
||||||
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
# --------------------------
|
# --------------------------
|
||||||
# ALL CLAIMS VIEW
|
# View/AllClaims
|
||||||
# --------------------------
|
# --------------------------
|
||||||
if view == "All Claims":
|
if view == "All Claims":
|
||||||
st.header("All Claims")
|
st.header("All Claims")
|
||||||
@@ -150,51 +138,33 @@ if view == "All Claims":
|
|||||||
st.markdown("---")
|
st.markdown("---")
|
||||||
|
|
||||||
# --------------------------
|
# --------------------------
|
||||||
# SINGLE CLAIM RANDOM VIEW
|
# View/Annotate
|
||||||
# --------------------------
|
# --------------------------
|
||||||
|
|
||||||
elif view == "Single Claim Random":
|
elif view == "Single Claim Random":
|
||||||
# Select new entry if needed
|
|
||||||
if st.session_state.current_claim is None:
|
if st.session_state.current_claim is None:
|
||||||
|
|
||||||
unscored_entries = []
|
unannotated = []
|
||||||
|
|
||||||
for entry in st.session_state.data:
|
for entry in st.session_state.data:
|
||||||
unscored = []
|
claims = []
|
||||||
|
|
||||||
for o in entry.get("output", []):
|
for o in entry.get("output", []):
|
||||||
for c in o.get("content_parsed", []):
|
for c in o.get("content_parsed", []):
|
||||||
if c.get("human_score") is None:
|
if not c.get("ranked"):
|
||||||
unscored.append(c)
|
claims.append(c)
|
||||||
|
|
||||||
if unscored:
|
if claims:
|
||||||
# try to find an existing entry with same documentUrl
|
unannotated.append({"entry": entry, "claims": claims})
|
||||||
existing = next(
|
|
||||||
(item for item in unscored_entries
|
|
||||||
if item["entry"]["documentUrl"] == entry["documentUrl"]),
|
|
||||||
None
|
|
||||||
)
|
|
||||||
|
|
||||||
if existing:
|
if unannotated:
|
||||||
# append new claims to existing entry
|
st.session_state.current_claim = random.choice(unannotated)
|
||||||
existing["claims"].extend(unscored)
|
|
||||||
else:
|
|
||||||
# create new object
|
|
||||||
unscored_entries.append({
|
|
||||||
"entry": entry,
|
|
||||||
"claims": list(unscored)
|
|
||||||
})
|
|
||||||
|
|
||||||
if unscored_entries:
|
|
||||||
st.session_state.current_claim = random.choice(unscored_entries)
|
|
||||||
st.session_state.drag_order = None
|
st.session_state.drag_order = None
|
||||||
else:
|
|
||||||
st.session_state.current_claim = None
|
|
||||||
|
|
||||||
bundle = st.session_state.current_claim
|
bundle = st.session_state.current_claim
|
||||||
|
|
||||||
if bundle is None:
|
if bundle is None:
|
||||||
st.info("No entries remaining without human scores.")
|
st.info("All items annotated.")
|
||||||
else:
|
else:
|
||||||
entry = bundle["entry"]
|
entry = bundle["entry"]
|
||||||
claims = bundle["claims"]
|
claims = bundle["claims"]
|
||||||
@@ -202,144 +172,164 @@ elif view == "Single Claim Random":
|
|||||||
st.subheader(entry.get("text"))
|
st.subheader(entry.get("text"))
|
||||||
st.write(entry.get("normalized", ""))
|
st.write(entry.get("normalized", ""))
|
||||||
|
|
||||||
# --------------------------
|
|
||||||
# Stable Drag IDs (FIX)
|
|
||||||
# --------------------------
|
|
||||||
|
|
||||||
claim_ids = [str(i) for i in range(len(claims))]
|
|
||||||
|
|
||||||
# Initialize order only once
|
|
||||||
if (
|
|
||||||
st.session_state.drag_order is None
|
|
||||||
or len(st.session_state.drag_order) != len(claim_ids)
|
|
||||||
):
|
|
||||||
st.session_state.drag_order = claim_ids.copy()
|
|
||||||
|
|
||||||
ordered_indices = [
|
|
||||||
int(i) for i in st.session_state.drag_order
|
|
||||||
]
|
|
||||||
|
|
||||||
# --------------------------
|
|
||||||
# Annotation Section
|
|
||||||
# --------------------------
|
|
||||||
|
|
||||||
st.subheader("Annotate Events")
|
st.subheader("Annotate Events")
|
||||||
|
|
||||||
for pos, idx in enumerate(ordered_indices):
|
for idx, c in enumerate(claims):
|
||||||
c = claims[idx]
|
|
||||||
|
|
||||||
with st.container(border=True):
|
with st.container(border=True):
|
||||||
|
|
||||||
st.markdown(f"**Event:** {c.get('event')}")
|
st.markdown(f"**Event:** {c.get('event')}")
|
||||||
st.markdown(
|
st.markdown(f"**Reasoning:** {c.get('reasoningWhyRelevant')}")
|
||||||
f"**Reasoning:** {c.get('reasoningWhyRelevant')}"
|
|
||||||
)
|
|
||||||
|
|
||||||
cols = st.columns(7)
|
cols = st.columns(7)
|
||||||
|
|
||||||
temp = ""
|
temp = ""
|
||||||
|
|
||||||
with cols[0]:
|
labels = [
|
||||||
a = st.checkbox("Rewording", key = "R" + str(idx) + c.get('event') )
|
("Rewording", "REWORDING"),
|
||||||
temp += "REWORDING " if a else ""
|
("Not Specific", "NSPECIFIC"),
|
||||||
|
("Time Incorrect", "TINCORRECT"),
|
||||||
|
("Story?", "STORY"),
|
||||||
|
("Duplicate?", "DUPLICATE"),
|
||||||
|
("Bias Shown", "BIAS"),
|
||||||
|
("Perfect", "PERFECT"),
|
||||||
|
]
|
||||||
|
|
||||||
with cols[1]:
|
for i, (name, tag) in enumerate(labels):
|
||||||
a = st.checkbox("Not Specific", key = "S" + str(idx) + c.get('event') )
|
with cols[i]:
|
||||||
temp += "NSPECIFIC " if a else ""
|
if st.checkbox(name, key=f"{tag}{idx}{c.get('event')}"):
|
||||||
|
temp += tag + " "
|
||||||
|
|
||||||
with cols[2]:
|
c["extra_info"] = temp.strip()
|
||||||
a = st.checkbox("Time Incorrect", key = "T" + str(idx) + c.get('event') )
|
c["ranked"] = True
|
||||||
temp += "TINCORRECT " if a else ""
|
|
||||||
|
|
||||||
with cols[3]:
|
if st.button("Save Annotation"):
|
||||||
a = st.checkbox("Story?", key = "Y" + str(idx) + c.get('event') )
|
save_data(INPUT_FILE, st.session_state.data)
|
||||||
temp += "STORY " if a else ""
|
st.session_state.current_claim = None
|
||||||
|
print("Annotation saved")
|
||||||
with cols[4]:
|
|
||||||
a = st.checkbox("Duplicate?", key = "D" + str(idx) + c.get('event') )
|
|
||||||
temp += "DUPLICATE " if a else ""
|
|
||||||
|
|
||||||
with cols[5]:
|
|
||||||
a = st.checkbox("Bias Shown", key = "B" + str(idx) + c.get('event') )
|
|
||||||
temp += "BIAS " if a else ""
|
|
||||||
|
|
||||||
with cols[6]:
|
|
||||||
a = st.checkbox("Perfect", key = "P" + str(idx) + c.get('event') )
|
|
||||||
temp += "PERFECT " if a else ""
|
|
||||||
|
|
||||||
c["extra_info"] = temp
|
|
||||||
|
|
||||||
# ---- MOVE BUTTONS ----
|
|
||||||
move_cols = st.columns(2)
|
|
||||||
|
|
||||||
with move_cols[0]:
|
|
||||||
if st.button(
|
|
||||||
"Up",
|
|
||||||
key="UP" + str(idx) + c.get("event")
|
|
||||||
):
|
|
||||||
if pos > 0:
|
|
||||||
order = st.session_state.drag_order
|
|
||||||
order[pos], order[pos - 1] = order[pos - 1], order[pos]
|
|
||||||
st.session_state.drag_order = order
|
|
||||||
st.rerun()
|
st.rerun()
|
||||||
|
|
||||||
with move_cols[1]:
|
# --------------------------
|
||||||
if st.button(
|
# View/Rank
|
||||||
"Down",
|
# --------------------------
|
||||||
key="DOWN" + str(idx) + c.get("event")
|
|
||||||
):
|
|
||||||
if pos < len(st.session_state.drag_order) - 1:
|
|
||||||
order = st.session_state.drag_order
|
|
||||||
order[pos], order[pos + 1] = order[pos + 1], order[pos]
|
|
||||||
st.session_state.drag_order = order
|
|
||||||
st.rerun()
|
|
||||||
|
|
||||||
|
elif view == "Rank Perfect Events":
|
||||||
|
|
||||||
# --------------------------
|
st.header("Rank PERFECT Events")
|
||||||
# Submit Ranking
|
candidates = []
|
||||||
# --------------------------
|
|
||||||
|
|
||||||
if st.button("Submit Ranking"):
|
for entry in st.session_state.data:
|
||||||
|
perfect = []
|
||||||
|
|
||||||
n = len(ordered_indices)
|
for o in entry.get("output", []):
|
||||||
|
for c in o.get("content_parsed", []):
|
||||||
|
if "PERFECT" in c.get("extra_info", "") and not c.get("rank_position"):
|
||||||
|
perfect.append(c)
|
||||||
|
|
||||||
for rank_position, idx in enumerate(ordered_indices):
|
if perfect:
|
||||||
|
candidates.append({"entry": entry, "claims": perfect})
|
||||||
|
|
||||||
|
if not candidates:
|
||||||
|
st.info("No PERFECT events available.")
|
||||||
|
st.stop()
|
||||||
|
|
||||||
|
if "current_bundle" not in st.session_state:
|
||||||
|
st.session_state.current_bundle = random.choice(candidates)
|
||||||
|
|
||||||
|
bundle = st.session_state.current_bundle
|
||||||
|
entry = bundle["entry"]
|
||||||
|
claims = bundle["claims"]
|
||||||
|
|
||||||
|
st.subheader(entry.get("text"))
|
||||||
|
|
||||||
|
# init
|
||||||
|
if "perfect_order" not in st.session_state:
|
||||||
|
st.session_state.perfect_order = list(range(len(claims)))
|
||||||
|
|
||||||
|
order = st.session_state.perfect_order
|
||||||
|
|
||||||
|
# labels shown in sortable UI
|
||||||
|
labels = [
|
||||||
|
f"{i+1}. {claims[idx].get('event')}"
|
||||||
|
for i, idx in enumerate(order)
|
||||||
|
]
|
||||||
|
|
||||||
|
st.markdown("### Drag to reorder:")
|
||||||
|
|
||||||
|
# -------------------------
|
||||||
|
# Drag & drop UI
|
||||||
|
# -------------------------
|
||||||
|
new_labels = sort_items(labels)
|
||||||
|
|
||||||
|
# Convert reordered labels back → indices
|
||||||
|
if new_labels != labels:
|
||||||
|
new_order = []
|
||||||
|
for lbl in new_labels:
|
||||||
|
original_pos = labels.index(lbl)
|
||||||
|
new_order.append(order[original_pos])
|
||||||
|
|
||||||
|
st.session_state.perfect_order = new_order
|
||||||
|
order = new_order
|
||||||
|
|
||||||
|
st.markdown("---")
|
||||||
|
for rank, idx in enumerate(order):
|
||||||
|
c = claims[idx]
|
||||||
|
st.markdown(f"**Rank {rank+1}: {c.get('event')}**")
|
||||||
|
st.markdown(c.get("reasoningWhyRelevant"))
|
||||||
|
st.markdown("---")
|
||||||
|
|
||||||
|
if st.button("Submit PERFECT Ranking"):
|
||||||
|
|
||||||
|
n = len(order)
|
||||||
|
|
||||||
|
for rank_position, idx in enumerate(order):
|
||||||
claim_obj = claims[idx]
|
claim_obj = claims[idx]
|
||||||
score = 0
|
|
||||||
if n == 1:
|
|
||||||
score = 1.0
|
|
||||||
else:
|
|
||||||
score = 1 - (rank_position / (n - 1))
|
|
||||||
|
|
||||||
if (claim_obj["extra_info"] != ""):
|
# explicit stored rank
|
||||||
if (claim_obj["extra_info"].find("PERFECT") != -1):
|
claim_obj["rank_position"] = rank_position + 1
|
||||||
score = 1
|
|
||||||
elif(claim_obj["extra_info"].find("DUPLICATE") != -1):
|
|
||||||
score = 0
|
|
||||||
else:
|
|
||||||
score *= 0.5
|
|
||||||
|
|
||||||
|
claim_obj["human_score"] = 1
|
||||||
|
|
||||||
claim_obj["human_score"] = round(score, 3)
|
# Auto-scoring
|
||||||
|
for entry in st.session_state.data:
|
||||||
|
for o in entry.get("output", []):
|
||||||
|
for c in o.get("content_parsed", []):
|
||||||
|
|
||||||
|
if c.get("human_score") is not None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
extra = c.get("extra_info", "")
|
||||||
|
|
||||||
|
if "DUPLICATE" in extra:
|
||||||
|
c["human_score"] = 0
|
||||||
|
elif extra:
|
||||||
|
c["human_score"] = round(
|
||||||
|
c.get("score", 0) * 0.5, 3
|
||||||
|
)
|
||||||
|
|
||||||
save_data(INPUT_FILE, st.session_state.data)
|
save_data(INPUT_FILE, st.session_state.data)
|
||||||
save_data_clean(OUTPUT_FILE, copy.deepcopy(st.session_state.data))
|
save_data_clean(
|
||||||
|
OUTPUT_FILE,
|
||||||
|
copy.deepcopy(st.session_state.data)
|
||||||
|
)
|
||||||
|
|
||||||
|
# reset state for next example
|
||||||
|
del st.session_state.current_bundle
|
||||||
|
del st.session_state.perfect_order
|
||||||
|
|
||||||
print("Ranking converted to scores and saved!")
|
print("Ranking saved!")
|
||||||
|
|
||||||
st.session_state.current_claim = None
|
|
||||||
st.session_state.drag_order = None
|
|
||||||
|
|
||||||
st.rerun()
|
st.rerun()
|
||||||
|
|
||||||
|
# --------------------------
|
||||||
|
# View/Rules
|
||||||
|
# --------------------------
|
||||||
elif view == "View Rules":
|
elif view == "View Rules":
|
||||||
with open("rules.txt", "r", encoding="utf-8") as f:
|
with open("rules.txt", "r", encoding="utf-8") as f:
|
||||||
st.write(f.read())
|
st.write(f.read())
|
||||||
|
|
||||||
|
# --------------------------
|
||||||
|
# View/Statistics
|
||||||
|
# --------------------------
|
||||||
elif view == "Statistics":
|
elif view == "Statistics":
|
||||||
|
|
||||||
st.header("Statistics")
|
st.header("Statistics")
|
||||||
|
|
||||||
word_counter = Counter()
|
word_counter = Counter()
|
||||||
@@ -359,20 +349,9 @@ elif view == "Statistics":
|
|||||||
words = extra.strip().split()
|
words = extra.strip().split()
|
||||||
word_counter.update(words)
|
word_counter.update(words)
|
||||||
|
|
||||||
# ---- human score aggregation ----
|
# --------------------------
|
||||||
hs = c.get("human_score")
|
|
||||||
if hs is not None and doc_url:
|
|
||||||
doc_scores[doc_url].append(hs)
|
|
||||||
|
|
||||||
# ---- diff score aggregation ----
|
|
||||||
s = c.get("score")
|
|
||||||
if hs is not None and s is not None and doc_url:
|
|
||||||
diff = abs(hs - s)
|
|
||||||
diff_scores[doc_url].append(diff)
|
|
||||||
|
|
||||||
# ==========================
|
|
||||||
# Extra Info Word Counts
|
# Extra Info Word Counts
|
||||||
# ==========================
|
# --------------------------
|
||||||
st.subheader("Extra Info Label Counts")
|
st.subheader("Extra Info Label Counts")
|
||||||
|
|
||||||
if word_counter:
|
if word_counter:
|
||||||
@@ -385,86 +364,3 @@ elif view == "Statistics":
|
|||||||
st.bar_chart(df_words.set_index("Label"))
|
st.bar_chart(df_words.set_index("Label"))
|
||||||
else:
|
else:
|
||||||
st.info("No extra_info data available yet.")
|
st.info("No extra_info data available yet.")
|
||||||
|
|
||||||
# ==========================
|
|
||||||
# Avg Human Score per Document
|
|
||||||
# ==========================
|
|
||||||
st.subheader("Average Human Score per documentUrl")
|
|
||||||
|
|
||||||
avg_scores = []
|
|
||||||
|
|
||||||
for doc, scores in doc_scores.items():
|
|
||||||
if scores:
|
|
||||||
avg_scores.append({
|
|
||||||
"documentUrl": doc,
|
|
||||||
"avg_human_score": sum(scores) / len(scores),
|
|
||||||
"num_events": len(scores)
|
|
||||||
})
|
|
||||||
|
|
||||||
if avg_scores:
|
|
||||||
df_scores = pd.DataFrame(avg_scores).sort_values(
|
|
||||||
"avg_human_score",
|
|
||||||
ascending=False
|
|
||||||
)
|
|
||||||
|
|
||||||
st.dataframe(df_scores)
|
|
||||||
# ==========================
|
|
||||||
# Distribution (rounded to 0.1)
|
|
||||||
# ==========================
|
|
||||||
|
|
||||||
st.subheader("Distribution of Average Human Scores (Rounded to 0.1)")
|
|
||||||
|
|
||||||
# round averages to nearest 0.1
|
|
||||||
df_scores["rounded_score"] = (
|
|
||||||
df_scores["avg_human_score"].round(1)
|
|
||||||
)
|
|
||||||
|
|
||||||
# count how many docs fall into each bucket
|
|
||||||
dist = (
|
|
||||||
df_scores["rounded_score"]
|
|
||||||
.value_counts()
|
|
||||||
.sort_index()
|
|
||||||
.reset_index()
|
|
||||||
)
|
|
||||||
|
|
||||||
dist.columns = ["rounded_score", "count"]
|
|
||||||
|
|
||||||
# ensure all bins from 0.0 → 1.0 exist
|
|
||||||
all_bins = pd.DataFrame({
|
|
||||||
"rounded_score": [round(x * 0.1, 1) for x in range(11)]
|
|
||||||
})
|
|
||||||
|
|
||||||
dist = (
|
|
||||||
all_bins.merge(dist, on="rounded_score", how="left")
|
|
||||||
.fillna(0)
|
|
||||||
)
|
|
||||||
|
|
||||||
dist["count"] = dist["count"].astype(int)
|
|
||||||
|
|
||||||
# plot counts per score bucket
|
|
||||||
st.bar_chart(
|
|
||||||
dist.set_index("rounded_score")["count"]
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
st.info("No human scores available yet.")
|
|
||||||
|
|
||||||
# ==========================
|
|
||||||
# Overall Model vs Human Difference
|
|
||||||
# ==========================
|
|
||||||
st.subheader("Model vs Human Agreement")
|
|
||||||
|
|
||||||
all_diffs = [
|
|
||||||
diff
|
|
||||||
for diffs in diff_scores.values()
|
|
||||||
for diff in diffs
|
|
||||||
]
|
|
||||||
|
|
||||||
if all_diffs:
|
|
||||||
avg_diff = sum(all_diffs) / len(all_diffs)
|
|
||||||
|
|
||||||
st.write(
|
|
||||||
f"Average absolute difference between model score and human score: "
|
|
||||||
f"**{avg_diff:.3f}**"
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
st.info("No items have both score and human_score yet.")
|
|
||||||
@@ -1 +1,2 @@
|
|||||||
streamlit
|
streamlit
|
||||||
|
streamlit-sortables
|
||||||
@@ -15,3 +15,6 @@
|
|||||||
5. Proposed trigger events should be sufficiently different from one another
|
5. Proposed trigger events should be sufficiently different from one another
|
||||||
|
|
||||||
6. Proposed trigger events must be free from bias, and backed up by reliable evidence
|
6. Proposed trigger events must be free from bias, and backed up by reliable evidence
|
||||||
|
|
||||||
|
Edge case handling:
|
||||||
|
If an analysis is otherwise perfect but contains section(s) that violate one of the rules above, a worst-case approach should be taken and the analysis should be labelled negatively.
|
||||||
Reference in New Issue
Block a user