diff --git a/supporting/Wrapper/run.ts b/supporting/Wrapper/run.ts
index 1987390..a83f08a 100644
--- a/supporting/Wrapper/run.ts
+++ b/supporting/Wrapper/run.ts
@@ -9,7 +9,7 @@ const INPUT_FILE = "../../data/claims.json";
 const OUTPUT_FILE = "../../data/results.jsonl";
 const API_URL = "http://localhost:2024";
 const AGENT_NAME = "agent";
-const MAX_CONCURRENCY = 50;
+const MAX_CONCURRENCY = 5;
 
 const client = new Client({ apiUrl: API_URL });
 
diff --git a/supporting/dbkf/fetch.py b/supporting/dbkf/fetch.py
index 531622c..e8eb62d 100644
--- a/supporting/dbkf/fetch.py
+++ b/supporting/dbkf/fetch.py
@@ -26,8 +26,37 @@ DEFAULT_PARAMS = [
 
 NUM_RANDOM_CLAIMS = 20
 
+INPUT_FILE = "../../data/input.jsonl"
 OUTPUT_FILE = "../../data/claims.json"
 
+def load_existing_urls(input_file):
+    """Collect the "documentUrl" values from the JSONL file at input_file.
+
+    Blank lines are skipped. A missing file or any other read/parse error is
+    printed, and the set of URLs gathered so far (possibly empty) is returned.
+    """
+    existing_urls = set()
+
+    try:
+        with open(input_file, "r", encoding="utf-8") as f:
+            for line in f:
+                line = line.strip()
+                if not line:
+                    continue
+                obj = json.loads(line)
+                url = obj.get("documentUrl")
+                if url:
+                    existing_urls.add(url)
+
+        print(f"Loaded {len(existing_urls)} existing document URLs.")
+    except FileNotFoundError:
+        print(f"No existing file found at {input_file}")
+    except Exception as e:
+        print(f"Error reading JSONL file: {e}")
+
+    return existing_urls
+
+
 def fetch_claims(params=None):
     if params is None:
         params = DEFAULT_PARAMS
@@ -47,11 +76,32 @@ def fetch_claims(params=None):
         print(f"Error fetching data: {e}")
         return []
 
-def save_random_claims(documents, output_file, num_claims=20):
+def save_random_claims(documents, output_file, num_claims=20, excluded_urls=None):
+    """Pick up to num_claims random documents, skipping already-used ones.
+
+    Documents whose "documentUrl" is in excluded_urls are removed before
+    sampling. excluded_urls comes after num_claims so existing positional
+    calls (documents, output_file, num_claims) keep their meaning.
+    """
     if not documents:
         print("No documents to save.")
         return
 
+    excluded_urls = excluded_urls or set()
+
+    # remove already-used documents
+    filtered_docs = [
+        d for d in documents
+        if d.get("documentUrl") not in excluded_urls
+    ]
+
+    print(f"{len(filtered_docs)} documents remain after filtering.")
+
+    if not filtered_docs:
+        print("No new documents available after filtering.")
+        return
+
-    sample_size = min(num_claims, len(documents))
-    selected = random.sample(documents, sample_size)
+    # sample from the filtered pool so excluded documents are never picked
+    sample_size = min(num_claims, len(filtered_docs))
+    selected = random.sample(filtered_docs, sample_size)
 
@@ -61,5 +111,13 @@ def save_random_claims(documents, output_file, num_claims=20):
     print(f"Saved {sample_size} random claims to {output_file}")
 
 if __name__ == "__main__":
+    existing_urls = load_existing_urls(INPUT_FILE)
+
     docs = fetch_claims()
-    save_random_claims(docs, OUTPUT_FILE, NUM_RANDOM_CLAIMS)
+
+    save_random_claims(
+        docs,
+        OUTPUT_FILE,
+        excluded_urls=existing_urls,
+        num_claims=NUM_RANDOM_CLAIMS
+    )
diff --git a/supporting/scorer/views/rank.py b/supporting/scorer/views/rank.py
index 43a0e1e..7235f73 100644
--- a/supporting/scorer/views/rank.py
+++ b/supporting/scorer/views/rank.py
@@ -10,7 +10,7 @@ def page_title() -> str:
     return "Rank"
 
 def render():
-    st.header("Rank PERFECT Events")
+    st.header("Rank Events")
 
     candidates = []
     for entry in st.session_state.data:
@@ -25,7 +25,7 @@ def render():
             candidates.append({"entry": entry, "claims": perfect})
 
     if not candidates:
-        st.info("No PERFECT events available.")
+        st.info("No events available.")
         st.stop()
 
     if "current_bundle" not in st.session_state: