Prepare for mass data collection. Reduce concurrency so as not to overwhelm the scraper during long sessions. Remove duplicates from the fetch script. Remove naming weirdness on the scorer frontend.
This commit is contained in:
@@ -26,8 +26,32 @@ DEFAULT_PARAMS = [
|
||||
|
||||
NUM_RANDOM_CLAIMS = 20
|
||||
|
||||
INPUT_FILE = "../../data/input.jsonl"
|
||||
OUTPUT_FILE = "../../data/claims.json"
|
||||
|
||||
def load_existing_urls(input_file):
    """Collect the ``documentUrl`` values already present in a JSONL file.

    Each non-blank line of *input_file* is parsed as a standalone JSON
    document. Records that are JSON objects with a non-empty string
    ``documentUrl`` contribute that URL to the result.

    Malformed lines and non-object records are skipped individually, so a
    single bad line no longer discards every URL that follows it.

    Args:
        input_file: Path to the JSONL file to scan.

    Returns:
        A set of document URLs found in the file. The set is empty when the
        file does not exist or cannot be read.
    """
    existing_urls = set()

    try:
        with open(input_file, "r", encoding="utf-8") as f:
            for line_number, line in enumerate(f, start=1):
                line = line.strip()
                if not line:
                    continue

                # Parse per line: one corrupt record must not truncate the
                # exclusion set, or already-used documents slip through.
                try:
                    obj = json.loads(line)
                except json.JSONDecodeError as e:
                    print(f"Skipping malformed JSON on line {line_number}: {e}")
                    continue

                # Only JSON objects can carry a "documentUrl" key.
                if not isinstance(obj, dict):
                    continue

                url = obj.get("documentUrl")
                if isinstance(url, str) and url:
                    existing_urls.add(url)
    except FileNotFoundError:
        print(f"No existing file found at {input_file}")
    except (OSError, UnicodeError) as e:
        print(f"Error reading JSONL file: {e}")
    else:
        print(f"Loaded {len(existing_urls)} existing document URLs.")

    return existing_urls
|
||||
|
||||
|
||||
def fetch_claims(params=None):
|
||||
if params is None:
|
||||
params = DEFAULT_PARAMS
|
||||
@@ -47,11 +71,25 @@ def fetch_claims(params=None):
|
||||
print(f"Error fetching data: {e}")
|
||||
return []
|
||||
|
||||
def save_random_claims(documents, output_file, num_claims=20):
|
||||
def save_random_claims(documents, output_file, excluded_urls=None, num_claims=20):
|
||||
if not documents:
|
||||
print("No documents to save.")
|
||||
return
|
||||
|
||||
excluded_urls = excluded_urls or set()
|
||||
|
||||
# remove already-used documents
|
||||
filtered_docs = [
|
||||
d for d in documents
|
||||
if d.get("documentUrl") not in excluded_urls
|
||||
]
|
||||
|
||||
print(f"{len(filtered_docs)} documents remain after filtering.")
|
||||
|
||||
if not filtered_docs:
|
||||
print("No new documents available after filtering.")
|
||||
return
|
||||
|
||||
# Sample from the filtered list, not the raw input; otherwise documents whose
# URL is in excluded_urls can still be selected and the de-duplication is a no-op.
sample_size = min(num_claims, len(filtered_docs))
selected = random.sample(filtered_docs, sample_size)
|
||||
|
||||
@@ -61,5 +99,13 @@ def save_random_claims(documents, output_file, num_claims=20):
|
||||
print(f"Saved {sample_size} random claims to {output_file}")
|
||||
|
||||
if __name__ == "__main__":
    # URLs already present in the input file are excluded from the new sample.
    existing_urls = load_existing_urls(INPUT_FILE)

    docs = fetch_claims()

    # Pass the optional arguments by keyword: the third positional parameter
    # of save_random_claims is excluded_urls, not num_claims.
    save_random_claims(
        docs,
        OUTPUT_FILE,
        excluded_urls=existing_urls,
        num_claims=NUM_RANDOM_CLAIMS,
    )
|
||||
|
||||
Reference in New Issue
Block a user