Prepare for mass data collection. Reduce concurrency so as not to overwhelm the scraper on long sessions. Remove duplicates from the fetch script. Remove naming weirdness on the scorer frontend.

This commit is contained in:
William Jeynes
2026-02-27 14:41:10 +00:00
parent 201176e71c
commit c94812ed80
3 changed files with 51 additions and 5 deletions
+1 -1
View File
@@ -9,7 +9,7 @@ const INPUT_FILE = "../../data/claims.json";
const OUTPUT_FILE = "../../data/results.jsonl";
const API_URL = "http://localhost:2024";
const AGENT_NAME = "agent";
const MAX_CONCURRENCY = 50;
const MAX_CONCURRENCY = 5;
const client = new Client({ apiUrl: API_URL });
+48 -2
View File
@@ -26,8 +26,32 @@ DEFAULT_PARAMS = [
NUM_RANDOM_CLAIMS = 20
INPUT_FILE = "../../data/input.jsonl"
OUTPUT_FILE = "../../data/claims.json"
def load_existing_urls(input_file):
    """Collect the ``documentUrl`` values already present in a JSONL file.

    Each non-blank line of *input_file* is parsed as JSON. Lines that are
    not valid JSON, or that do not decode to a JSON object, are reported
    and skipped individually so that one bad record does not hide every
    URL that follows it.

    Args:
        input_file: Path to the JSONL file to scan.

    Returns:
        A set of the non-empty string ``documentUrl`` values found. The set
        is empty when the file does not exist, and holds whatever was read
        so far when the file cannot be read to the end.
    """
    existing_urls = set()

    try:
        with open(input_file, "r", encoding="utf-8") as f:
            for line_number, line in enumerate(f, start=1):
                line = line.strip()
                if not line:
                    continue

                try:
                    obj = json.loads(line)
                except json.JSONDecodeError as e:
                    print(f"Skipping malformed JSON on line {line_number}: {e}")
                    continue

                # Valid JSON that is not an object (list, number, ...) has
                # no keys to look up.
                if not isinstance(obj, dict):
                    print(f"Skipping non-object JSON on line {line_number}.")
                    continue

                url = obj.get("documentUrl")
                # Only non-empty strings are usable as exclusion keys.
                if isinstance(url, str) and url:
                    existing_urls.add(url)
    except FileNotFoundError:
        print(f"No existing file found at {input_file}")
    except (OSError, UnicodeDecodeError) as e:
        print(f"Error reading JSONL file: {e}")
    else:
        # Reported only when the whole file was read without an I/O error.
        print(f"Loaded {len(existing_urls)} existing document URLs.")

    return existing_urls
def fetch_claims(params=None):
if params is None:
params = DEFAULT_PARAMS
@@ -47,11 +71,25 @@ def fetch_claims(params=None):
print(f"Error fetching data: {e}")
return []
def save_random_claims(documents, output_file, num_claims=20):
def save_random_claims(documents, output_file, excluded_urls=None, num_claims=20):
if not documents:
print("No documents to save.")
return
excluded_urls = excluded_urls or set()
# remove already-used documents
filtered_docs = [
d for d in documents
if d.get("documentUrl") not in excluded_urls
]
print(f"{len(filtered_docs)} documents remain after filtering.")
if not filtered_docs:
print("No new documents available after filtering.")
return
# Size and draw the sample from the filtered list, not the raw input;
# otherwise documents whose URL is in excluded_urls can still be selected.
sample_size = min(num_claims, len(filtered_docs))
selected = random.sample(filtered_docs, sample_size)
@@ -61,5 +99,13 @@ def save_random_claims(documents, output_file, num_claims=20):
print(f"Saved {sample_size} random claims to {output_file}")
if __name__ == "__main__":
existing_urls = load_existing_urls(INPUT_FILE)
docs = fetch_claims()
save_random_claims(docs, OUTPUT_FILE, NUM_RANDOM_CLAIMS)
save_random_claims(
docs,
OUTPUT_FILE,
excluded_urls=existing_urls,
num_claims=NUM_RANDOM_CLAIMS
)
+2 -2
View File
@@ -10,7 +10,7 @@ def page_title() -> str:
return "Rank"
def render():
st.header("Rank PERFECT Events")
st.header("Rank Events")
candidates = []
for entry in st.session_state.data:
@@ -25,7 +25,7 @@ def render():
candidates.append({"entry": entry, "claims": perfect})
if not candidates:
st.info("No PERFECT events available.")
st.info("No events available.")
st.stop()
if "current_bundle" not in st.session_state: