Prepare for mass data collection. Reduce concurrency so as not to overwhelm the scraper on long sessions. Remove duplicates from the fetch script. Remove naming weirdness on the scorer frontend.

2026-02-27 14:41:10 +00:00
parent 201176e71c
commit c94812ed80
3 changed files with 51 additions and 5 deletions
@@ -9,7 +9,7 @@ const INPUT_FILE = "../../data/claims.json";
 const OUTPUT_FILE = "../../data/results.jsonl";
 const API_URL = "http://localhost:2024";
 const AGENT_NAME = "agent";
-const MAX_CONCURRENCY = 50;
+const MAX_CONCURRENCY = 5;
 const client = new Client({ apiUrl: API_URL });
@@ -26,8 +26,32 @@ DEFAULT_PARAMS = [
 NUM_RANDOM_CLAIMS = 20
 INPUT_FILE = "../../data/input.jsonl"
 OUTPUT_FILE = "../../data/claims.json"
 def load_existing_urls(input_file):
    """Collect the ``documentUrl`` values already present in a JSONL file.

    Each non-blank line of ``input_file`` is parsed as one JSON value. Lines
    that are not valid JSON, or that do not decode to an object, are skipped
    individually, so one corrupt record does not hide every URL after it.

    Args:
        input_file: Path to the JSONL file to scan.

    Returns:
        A set of the non-empty string ``documentUrl`` values found. The set is
        empty when the file is missing and may be partial when reading fails
        midway; both cases are reported on stdout rather than raised.
    """
    existing_urls = set()
    try:
        with open(input_file, "r", encoding="utf-8") as f:
            for line_number, line in enumerate(f, start=1):
                line = line.strip()
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                except json.JSONDecodeError as e:
                    # Best effort: report the bad record and keep scanning.
                    print(f"Skipping malformed JSON on line {line_number}: {e}")
                    continue
                if not isinstance(obj, dict):
                    continue
                url = obj.get("documentUrl")
                # Only non-empty strings are usable (and hashable) as URLs.
                if isinstance(url, str) and url:
                    existing_urls.add(url)
        print(f"Loaded {len(existing_urls)} existing document URLs.")
    except FileNotFoundError:
        print(f"No existing file found at {input_file}")
    except (OSError, UnicodeDecodeError) as e:
        print(f"Error reading JSONL file: {e}")
    return existing_urls
 def fetch_claims(params=None):
    if params is None:
        params = DEFAULT_PARAMS
@@ -47,11 +71,25 @@ def fetch_claims(params=None):
        print(f"Error fetching data: {e}")
        return []
-def save_random_claims(documents, output_file, num_claims=20):
+def save_random_claims(documents, output_file, excluded_urls=None, num_claims=20):
    if not documents:
        print("No documents to save.")
        return
    excluded_urls = excluded_urls or set()
    # remove already-used documents
    filtered_docs = [
        d for d in documents
        if d.get("documentUrl") not in excluded_urls
    ]
    print(f"{len(filtered_docs)} documents remain after filtering.")
    if not filtered_docs:
        print("No new documents available after filtering.")
        return
    sample_size = min(num_claims, len(documents))
    selected = random.sample(documents, sample_size)
@@ -61,5 +99,13 @@ def save_random_claims(documents, output_file, num_claims=20):
    print(f"Saved {sample_size} random claims to {output_file}")
 if __name__ == "__main__":
    existing_urls = load_existing_urls(INPUT_FILE)
    docs = fetch_claims()
-    save_random_claims(docs, OUTPUT_FILE, NUM_RANDOM_CLAIMS)
+
    save_random_claims(
        docs,
        OUTPUT_FILE,
        excluded_urls=existing_urls,
        num_claims=NUM_RANDOM_CLAIMS
    )
@@ -10,7 +10,7 @@ def page_title() -> str:
    return "Rank"
 def render():
-    st.header("Rank PERFECT Events")
+    st.header("Rank Events")
    candidates = []
    for entry in st.session_state.data:
@@ -25,7 +25,7 @@ def render():
            candidates.append({"entry": entry, "claims": perfect})
    if not candidates:
-        st.info("No PERFECT events available.")
+        st.info("No events available.")
        st.stop()
    if "current_bundle" not in st.session_state: