Prepare for mass data collection. Reduce concurrency so as not to overwhelm the scraper during long sessions. Remove duplicates from the fetch script. Remove naming weirdness on the scorer frontend.

This commit is contained in:
William Jeynes
2026-02-27 14:41:10 +00:00
parent 201176e71c
commit c94812ed80
3 changed files with 51 additions and 5 deletions
+1 -1
View File
@@ -9,7 +9,7 @@ const INPUT_FILE = "../../data/claims.json";
const OUTPUT_FILE = "../../data/results.jsonl"; const OUTPUT_FILE = "../../data/results.jsonl";
const API_URL = "http://localhost:2024"; const API_URL = "http://localhost:2024";
const AGENT_NAME = "agent"; const AGENT_NAME = "agent";
const MAX_CONCURRENCY = 50; const MAX_CONCURRENCY = 5;
const client = new Client({ apiUrl: API_URL }); const client = new Client({ apiUrl: API_URL });
+48 -2
View File
@@ -26,8 +26,32 @@ DEFAULT_PARAMS = [
NUM_RANDOM_CLAIMS = 20 NUM_RANDOM_CLAIMS = 20
INPUT_FILE = "../../data/input.jsonl"
OUTPUT_FILE = "../../data/claims.json" OUTPUT_FILE = "../../data/claims.json"
def load_existing_urls(input_file):
    """Collect the ``documentUrl`` values already present in a JSONL file.

    Each non-blank line of ``input_file`` is parsed as one JSON value. Lines
    that are not valid JSON, or whose value is not a JSON object, are skipped
    with a warning instead of aborting the whole read, so a single corrupt
    line cannot silently shrink the returned set.

    Args:
        input_file: Path of the JSONL file to read.

    Returns:
        A set of the non-empty string ``documentUrl`` values found. The set
        is empty when the file does not exist, and holds whatever was
        collected so far when the file cannot be read or decoded.
    """
    existing_urls = set()

    try:
        with open(input_file, "r", encoding="utf-8") as f:
            for line_no, raw_line in enumerate(f, start=1):
                line = raw_line.strip()
                if not line:
                    continue

                try:
                    record = json.loads(line)
                except json.JSONDecodeError as e:
                    # One bad line must not discard the URLs on later lines.
                    print(f"Skipping malformed JSON on line {line_no} of {input_file}: {e}")
                    continue

                if not isinstance(record, dict):
                    # Valid JSON but not an object, so it cannot carry a documentUrl.
                    print(f"Skipping non-object entry on line {line_no} of {input_file}")
                    continue

                url = record.get("documentUrl")
                # Only strings are kept: they are hashable and comparable to
                # the URLs of freshly fetched documents.
                if isinstance(url, str) and url:
                    existing_urls.add(url)

        print(f"Loaded {len(existing_urls)} existing document URLs.")

    except FileNotFoundError:
        print(f"No existing file found at {input_file}")
    except (OSError, ValueError) as e:
        # Best effort: unreadable files and decoding failures
        # (UnicodeDecodeError is a ValueError) keep what was collected so far.
        print(f"Error reading JSONL file: {e}")

    return existing_urls
def fetch_claims(params=None): def fetch_claims(params=None):
if params is None: if params is None:
params = DEFAULT_PARAMS params = DEFAULT_PARAMS
@@ -47,11 +71,25 @@ def fetch_claims(params=None):
print(f"Error fetching data: {e}") print(f"Error fetching data: {e}")
return [] return []
def save_random_claims(documents, output_file, num_claims=20): def save_random_claims(documents, output_file, excluded_urls=None, num_claims=20):
if not documents: if not documents:
print("No documents to save.") print("No documents to save.")
return return
excluded_urls = excluded_urls or set()
# remove already-used documents
filtered_docs = [
d for d in documents
if d.get("documentUrl") not in excluded_urls
]
print(f"{len(filtered_docs)} documents remain after filtering.")
if not filtered_docs:
print("No new documents available after filtering.")
return
sample_size = min(num_claims, len(documents)) sample_size = min(num_claims, len(documents))
selected = random.sample(documents, sample_size) selected = random.sample(documents, sample_size)
@@ -61,5 +99,13 @@ def save_random_claims(documents, output_file, num_claims=20):
print(f"Saved {sample_size} random claims to {output_file}") print(f"Saved {sample_size} random claims to {output_file}")
if __name__ == "__main__": if __name__ == "__main__":
existing_urls = load_existing_urls(INPUT_FILE)
docs = fetch_claims() docs = fetch_claims()
save_random_claims(docs, OUTPUT_FILE, NUM_RANDOM_CLAIMS)
save_random_claims(
docs,
OUTPUT_FILE,
excluded_urls=existing_urls,
num_claims=NUM_RANDOM_CLAIMS
)
+2 -2
View File
@@ -10,7 +10,7 @@ def page_title() -> str:
return "Rank" return "Rank"
def render(): def render():
st.header("Rank PERFECT Events") st.header("Rank Events")
candidates = [] candidates = []
for entry in st.session_state.data: for entry in st.session_state.data:
@@ -25,7 +25,7 @@ def render():
candidates.append({"entry": entry, "claims": perfect}) candidates.append({"entry": entry, "claims": perfect})
if not candidates: if not candidates:
st.info("No PERFECT events available.") st.info("No events available.")
st.stop() st.stop()
if "current_bundle" not in st.session_state: if "current_bundle" not in st.session_state: