Prepare for mass data collection. Reduce concurrency so as not to overwhelm the scraper on long sessions. Remove duplicates from fetch script. Remove naming weirdness on scorer frontend.

2026-02-27 14:41:10 +00:00
parent 201176e71c
commit c94812ed80
3 changed files with 51 additions and 5 deletions
@@ -26,8 +26,32 @@ DEFAULT_PARAMS = [

 NUM_RANDOM_CLAIMS = 20

+INPUT_FILE = "../../data/input.jsonl"
 OUTPUT_FILE = "../../data/claims.json"

+def load_existing_urls(input_file):
+    existing_urls = set()
+
+    try:
+        with open(input_file, "r", encoding="utf-8") as f:
+            for line in f:
+                line = line.strip()
+                if not line:
+                    continue
+                obj = json.loads(line)
+                url = obj.get("documentUrl")
+                if url:
+                    existing_urls.add(url)
+
+        print(f"Loaded {len(existing_urls)} existing document URLs.")
+    except FileNotFoundError:
+        print(f"No existing file found at {input_file}")
+    except Exception as e:
+        print(f"Error reading JSONL file: {e}")
+
+    return existing_urls
+
+
 def fetch_claims(params=None):
    if params is None:
        params = DEFAULT_PARAMS
@@ -47,11 +71,25 @@ def fetch_claims(params=None):
        print(f"Error fetching data: {e}")
        return []

-def save_random_claims(documents, output_file, num_claims=20):
+def save_random_claims(documents, output_file, excluded_urls=None, num_claims=20):
    if not documents:
        print("No documents to save.")
        return

+    excluded_urls = excluded_urls or set()
+
+    # remove already-used documents
+    filtered_docs = [
+        d for d in documents
+        if d.get("documentUrl") not in excluded_urls
+    ]
+
+    print(f"{len(filtered_docs)} documents remain after filtering.")
+
+    if not filtered_docs:
+        print("No new documents available after filtering.")
+        return
+
    sample_size = min(num_claims, len(documents))
    selected = random.sample(documents, sample_size)

@@ -61,5 +99,13 @@ def save_random_claims(documents, output_file, num_claims=20):
    print(f"Saved {sample_size} random claims to {output_file}")

 if __name__ == "__main__":
+    existing_urls = load_existing_urls(INPUT_FILE)
+
    docs = fetch_claims()
-    save_random_claims(docs, OUTPUT_FILE, NUM_RANDOM_CLAIMS)
+
+    save_random_claims(
+        docs,
+        OUTPUT_FILE,
+        excluded_urls=existing_urls,
+        num_claims=NUM_RANDOM_CLAIMS
+    )