Prepare for mass data collection. Reduce concurrency so as not to overwhelm the scraper on long sessions. Remove duplicates from the fetch script. Remove naming weirdness on the scorer frontend.
This commit is contained in:
@@ -9,7 +9,7 @@ const INPUT_FILE = "../../data/claims.json";
|
||||
const OUTPUT_FILE = "../../data/results.jsonl";
|
||||
const API_URL = "http://localhost:2024";
|
||||
const AGENT_NAME = "agent";
|
||||
const MAX_CONCURRENCY = 50;
|
||||
const MAX_CONCURRENCY = 5;
|
||||
|
||||
const client = new Client({ apiUrl: API_URL });
|
||||
|
||||
|
||||
@@ -26,8 +26,32 @@ DEFAULT_PARAMS = [
|
||||
|
||||
NUM_RANDOM_CLAIMS = 20
|
||||
|
||||
INPUT_FILE = "../../data/input.jsonl"
|
||||
OUTPUT_FILE = "../../data/claims.json"
|
||||
|
||||
def load_existing_urls(input_file):
    """Collect the ``documentUrl`` values already present in a JSONL file.

    Each non-blank line of *input_file* is parsed as one JSON value. Lines
    that are not valid JSON, or that do not hold a JSON object, are skipped
    individually so a single bad record does not discard the URLs on the
    lines that follow it.

    Args:
        input_file: Path to the JSONL file to scan.

    Returns:
        A set of the non-empty string ``documentUrl`` values found. The set
        is empty when the file is missing, and holds whatever was collected
        so far when the file cannot be read to the end.
    """
    existing_urls = set()

    try:
        with open(input_file, "r", encoding="utf-8") as f:
            for line_number, line in enumerate(f, start=1):
                line = line.strip()
                if not line:
                    continue

                # Parse per line: one corrupt record must not abort the scan.
                # ValueError covers json.JSONDecodeError and oversized ints.
                try:
                    obj = json.loads(line)
                except ValueError as e:
                    print(f"Skipping malformed JSON on line {line_number}: {e}")
                    continue

                # Only JSON objects can carry a documentUrl key.
                if not isinstance(obj, dict):
                    continue

                url = obj.get("documentUrl")
                if isinstance(url, str) and url:
                    existing_urls.add(url)
    except FileNotFoundError:
        print(f"No existing file found at {input_file}")
    except (OSError, UnicodeDecodeError) as e:
        print(f"Error reading JSONL file: {e}")
    else:
        print(f"Loaded {len(existing_urls)} existing document URLs.")

    return existing_urls
|
||||
|
||||
|
||||
def fetch_claims(params=None):
|
||||
if params is None:
|
||||
params = DEFAULT_PARAMS
|
||||
@@ -47,11 +71,25 @@ def fetch_claims(params=None):
|
||||
print(f"Error fetching data: {e}")
|
||||
return []
|
||||
|
||||
def save_random_claims(documents, output_file, num_claims=20):
|
||||
def save_random_claims(documents, output_file, excluded_urls=None, num_claims=20):
|
||||
if not documents:
|
||||
print("No documents to save.")
|
||||
return
|
||||
|
||||
excluded_urls = excluded_urls or set()
|
||||
|
||||
# remove already-used documents
|
||||
filtered_docs = [
|
||||
d for d in documents
|
||||
if d.get("documentUrl") not in excluded_urls
|
||||
]
|
||||
|
||||
print(f"{len(filtered_docs)} documents remain after filtering.")
|
||||
|
||||
if not filtered_docs:
|
||||
print("No new documents available after filtering.")
|
||||
return
|
||||
|
||||
sample_size = min(num_claims, len(documents))
|
||||
selected = random.sample(documents, sample_size)
|
||||
|
||||
@@ -61,5 +99,13 @@ def save_random_claims(documents, output_file, num_claims=20):
|
||||
print(f"Saved {sample_size} random claims to {output_file}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
existing_urls = load_existing_urls(INPUT_FILE)
|
||||
|
||||
docs = fetch_claims()
|
||||
save_random_claims(docs, OUTPUT_FILE, NUM_RANDOM_CLAIMS)
|
||||
|
||||
save_random_claims(
|
||||
docs,
|
||||
OUTPUT_FILE,
|
||||
excluded_urls=existing_urls,
|
||||
num_claims=NUM_RANDOM_CLAIMS
|
||||
)
|
||||
|
||||
@@ -10,7 +10,7 @@ def page_title() -> str:
|
||||
return "Rank"
|
||||
|
||||
def render():
|
||||
st.header("Rank PERFECT Events")
|
||||
st.header("Rank Events")
|
||||
candidates = []
|
||||
|
||||
for entry in st.session_state.data:
|
||||
@@ -25,7 +25,7 @@ def render():
|
||||
candidates.append({"entry": entry, "claims": perfect})
|
||||
|
||||
if not candidates:
|
||||
st.info("No PERFECT events available.")
|
||||
st.info("No events available.")
|
||||
st.stop()
|
||||
|
||||
if "current_bundle" not in st.session_state:
|
||||
|
||||
Reference in New Issue
Block a user