Prepare for mass data collection. Reduce concurrency so as not to overwhelm the scraper on long sessions. Remove duplicates from the fetch script. Remove naming weirdness on the scorer frontend.
This commit is contained in:
@@ -9,7 +9,7 @@ const INPUT_FILE = "../../data/claims.json";
|
|||||||
const OUTPUT_FILE = "../../data/results.jsonl";
|
const OUTPUT_FILE = "../../data/results.jsonl";
|
||||||
const API_URL = "http://localhost:2024";
|
const API_URL = "http://localhost:2024";
|
||||||
const AGENT_NAME = "agent";
|
const AGENT_NAME = "agent";
|
||||||
const MAX_CONCURRENCY = 50;
|
const MAX_CONCURRENCY = 5;
|
||||||
|
|
||||||
const client = new Client({ apiUrl: API_URL });
|
const client = new Client({ apiUrl: API_URL });
|
||||||
|
|
||||||
|
|||||||
@@ -26,8 +26,32 @@ DEFAULT_PARAMS = [
|
|||||||
|
|
||||||
NUM_RANDOM_CLAIMS = 20
|
NUM_RANDOM_CLAIMS = 20
|
||||||
|
|
||||||
|
INPUT_FILE = "../../data/input.jsonl"
|
||||||
OUTPUT_FILE = "../../data/claims.json"
|
OUTPUT_FILE = "../../data/claims.json"
|
||||||
|
|
||||||
|
def load_existing_urls(input_file):
    """Collect the ``documentUrl`` values already present in a JSONL file.

    Each non-blank line of *input_file* is parsed as JSON. When the parsed
    value is an object with a non-empty string ``documentUrl``, that URL is
    added to the result. Lines that are not valid JSON, or whose top-level
    value is not an object, are reported and skipped so that one corrupt
    record does not discard the URLs on the lines that follow it.

    Args:
        input_file: Path of the JSONL file to read.

    Returns:
        set: The document URLs found. Empty when the file is missing; when
        reading fails part-way, the URLs gathered up to that point.
    """
    existing_urls = set()

    try:
        with open(input_file, "r", encoding="utf-8") as f:
            for line_number, line in enumerate(f, start=1):
                line = line.strip()
                if not line:
                    continue

                # Handle bad records per line: a single broken line must not
                # abort the whole load and let duplicates slip through.
                try:
                    obj = json.loads(line)
                except json.JSONDecodeError as e:
                    print(f"Skipping malformed JSON on line {line_number}: {e}")
                    continue

                if not isinstance(obj, dict):
                    print(f"Skipping non-object record on line {line_number}")
                    continue

                url = obj.get("documentUrl")
                # Only strings are kept; this also guards against unhashable
                # values (lists, dicts) that cannot be stored in a set.
                if url and isinstance(url, str):
                    existing_urls.add(url)
    except FileNotFoundError:
        print(f"No existing file found at {input_file}")
    except (OSError, UnicodeDecodeError) as e:
        print(f"Error reading JSONL file: {e}")
    else:
        print(f"Loaded {len(existing_urls)} existing document URLs.")

    return existing_urls
|
||||||
|
|
||||||
|
|
||||||
def fetch_claims(params=None):
|
def fetch_claims(params=None):
|
||||||
if params is None:
|
if params is None:
|
||||||
params = DEFAULT_PARAMS
|
params = DEFAULT_PARAMS
|
||||||
@@ -47,11 +71,25 @@ def fetch_claims(params=None):
|
|||||||
print(f"Error fetching data: {e}")
|
print(f"Error fetching data: {e}")
|
||||||
return []
|
return []
|
||||||
|
|
||||||
def save_random_claims(documents, output_file, num_claims=20):
|
def save_random_claims(documents, output_file, excluded_urls=None, num_claims=20):
|
||||||
if not documents:
|
if not documents:
|
||||||
print("No documents to save.")
|
print("No documents to save.")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
excluded_urls = excluded_urls or set()
|
||||||
|
|
||||||
|
# remove already-used documents
|
||||||
|
filtered_docs = [
|
||||||
|
d for d in documents
|
||||||
|
if d.get("documentUrl") not in excluded_urls
|
||||||
|
]
|
||||||
|
|
||||||
|
print(f"{len(filtered_docs)} documents remain after filtering.")
|
||||||
|
|
||||||
|
if not filtered_docs:
|
||||||
|
print("No new documents available after filtering.")
|
||||||
|
return
|
||||||
|
|
||||||
sample_size = min(num_claims, len(documents))
|
sample_size = min(num_claims, len(documents))
|
||||||
selected = random.sample(documents, sample_size)
|
selected = random.sample(documents, sample_size)
|
||||||
|
|
||||||
@@ -61,5 +99,13 @@ def save_random_claims(documents, output_file, num_claims=20):
|
|||||||
print(f"Saved {sample_size} random claims to {output_file}")
|
print(f"Saved {sample_size} random claims to {output_file}")
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
existing_urls = load_existing_urls(INPUT_FILE)
|
||||||
|
|
||||||
docs = fetch_claims()
|
docs = fetch_claims()
|
||||||
save_random_claims(docs, OUTPUT_FILE, NUM_RANDOM_CLAIMS)
|
|
||||||
|
save_random_claims(
|
||||||
|
docs,
|
||||||
|
OUTPUT_FILE,
|
||||||
|
excluded_urls=existing_urls,
|
||||||
|
num_claims=NUM_RANDOM_CLAIMS
|
||||||
|
)
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ def page_title() -> str:
|
|||||||
return "Rank"
|
return "Rank"
|
||||||
|
|
||||||
def render():
|
def render():
|
||||||
st.header("Rank PERFECT Events")
|
st.header("Rank Events")
|
||||||
candidates = []
|
candidates = []
|
||||||
|
|
||||||
for entry in st.session_state.data:
|
for entry in st.session_state.data:
|
||||||
@@ -25,7 +25,7 @@ def render():
|
|||||||
candidates.append({"entry": entry, "claims": perfect})
|
candidates.append({"entry": entry, "claims": perfect})
|
||||||
|
|
||||||
if not candidates:
|
if not candidates:
|
||||||
st.info("No PERFECT events available.")
|
st.info("No events available.")
|
||||||
st.stop()
|
st.stop()
|
||||||
|
|
||||||
if "current_bundle" not in st.session_state:
|
if "current_bundle" not in st.session_state:
|
||||||
|
|||||||
Reference in New Issue
Block a user