start on work to calculate % if valid URLS
This commit is contained in:
+20
-1
@@ -1,3 +1,22 @@
|
||||
## Refining the agent output
|
||||
|
||||
TODO: Table and document experiments
|
||||
Experiments modifying pipeline
|
||||
|
||||
| Model | % Correct | % Change |
|
||||
|------------------|----------:|---------:|
|
||||
| BASELINE | 33 | 0 |
|
||||
| Improv Prompt | 39.96 | 0.21 |
|
||||
| Add Examples | 44.67 | 0.35 |
|
||||
| Date | 45.51 | 0.38 |
|
||||
| Chain of Thought | 43.38 | 0.31 |
|
||||
| Self-Critique | 44.36 | 0.34 |
|
||||
|
||||
Experiments with different model types:
|
||||
| Model | % Correct | % Change |
|
||||
|-------------------------------|----------:|---------:|
|
||||
| gpt-5-mini | 33 | 0 |
|
||||
| gpt-5.4-mini | 32.4 | -0.02 |
|
||||
| llama3.1:8b-instruct-q4_K_M | ? | ? |
|
||||
| qwen3.5:9b | 0 | -100 |
|
||||
|
||||
%age correct URLS
|
||||
@@ -0,0 +1,104 @@
|
||||
import json
|
||||
import argparse
|
||||
from urllib.parse import urlparse
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.common.exceptions import WebDriverException, TimeoutException
|
||||
|
||||
def init_driver():
|
||||
options = Options()
|
||||
options.headless = True
|
||||
options.add_argument("--disable-gpu")
|
||||
options.add_argument("--no-sandbox")
|
||||
options.add_argument("--disable-blink-features=AutomationControlled")
|
||||
options.add_argument("--window-size=1920,1080")
|
||||
driver = webdriver.Chrome(options=options)
|
||||
driver.set_page_load_timeout(15)
|
||||
return driver
|
||||
|
||||
def is_root_url(url):
|
||||
"""Return True if URL is a root domain with no path."""
|
||||
parsed = urlparse(url)
|
||||
return parsed.path in ("", "/")
|
||||
|
||||
def is_404_page(driver):
|
||||
"""Check for 404 indicators in title or body text."""
|
||||
title = driver.title.lower()
|
||||
body_text = driver.find_element("tag name", "body").text.lower()
|
||||
if "404" in title or "not found" in title:
|
||||
return True
|
||||
if "404" in body_text or "not found" in body_text:
|
||||
return True
|
||||
return False
|
||||
|
||||
def check_url_selenium(driver, url):
|
||||
"""
|
||||
Check if a URL is valid:
|
||||
- Loads without error
|
||||
- Not a 404 page
|
||||
- Not redirected to a root domain
|
||||
"""
|
||||
try:
|
||||
driver.get(url)
|
||||
# 1️⃣ Check if page is 404
|
||||
if is_404_page(driver):
|
||||
print(f"Page returned 404: {url}")
|
||||
return False
|
||||
# 2️⃣ Check if current URL after redirects is a root URL
|
||||
final_url = driver.current_url
|
||||
if is_root_url(final_url):
|
||||
print(f"Redirected to root URL (invalid): {final_url}")
|
||||
return False
|
||||
return True
|
||||
except (WebDriverException, TimeoutException) as e:
|
||||
print(f"Error accessing URL {url}: {e}")
|
||||
return False
|
||||
|
||||
def process_event(event, driver, invalid_urls):
|
||||
url = event.get("Url")
|
||||
if url:
|
||||
is_valid = check_url_selenium(driver, url)
|
||||
event["url_valid"] = is_valid
|
||||
if not is_valid:
|
||||
invalid_urls.append(url)
|
||||
return is_valid
|
||||
return False
|
||||
|
||||
def process_jsonl_file(file_path):
|
||||
total_urls = 0
|
||||
valid_urls = 0
|
||||
invalid_urls = []
|
||||
|
||||
driver = init_driver()
|
||||
|
||||
with open(file_path, "r", encoding="utf-8") as f:
|
||||
for line in f:
|
||||
line_data = json.loads(line)
|
||||
|
||||
if line_data.get("status") != "success":
|
||||
continue
|
||||
|
||||
events = line_data.get("events", [])
|
||||
for event in events:
|
||||
total_urls += 1
|
||||
if process_event(event, driver, invalid_urls):
|
||||
valid_urls += 1
|
||||
|
||||
driver.quit()
|
||||
|
||||
# Summary
|
||||
print("\n=== URL Validation Summary ===")
|
||||
print(f"Total URLs processed: {total_urls}")
|
||||
print(f"Valid URLs (loaded successfully): {valid_urls}")
|
||||
print(f"Invalid URLs: {len(invalid_urls)}")
|
||||
if invalid_urls:
|
||||
print("\nList of invalid URLs:")
|
||||
for url in invalid_urls:
|
||||
print(url)
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Validate URLs in JSONL file events using Selenium")
|
||||
parser.add_argument("file_path", type=str, help="Path to the JSONL file")
|
||||
args = parser.parse_args()
|
||||
|
||||
process_jsonl_file(args.file_path)
|
||||
Reference in New Issue
Block a user