Start work on calculating the % of valid URLs

This commit is contained in:
William Jeynes
2026-04-04 18:52:47 +01:00
parent b610e8c989
commit 8c0921057b
2 changed files with 124 additions and 1 deletions
+20 -1
View File
@@ -1,3 +1,22 @@
## Refining the agent output
TODO: Table and document experiments. Experiments modifying the pipeline:
| Model | % Correct | % Change |
|------------------|----------:|---------:|
| BASELINE | 33 | 0 |
| Improved Prompt | 39.96 | 0.21 |
| Add Examples | 44.67 | 0.35 |
| Date | 45.51 | 0.38 |
| Chain of Thought | 43.38 | 0.31 |
| Self-Critique | 44.36 | 0.34 |
Experiments with different model types:
| Model | % Correct | % Change |
|-------------------------------|----------:|---------:|
| gpt-5-mini | 33 | 0 |
| gpt-5.4-mini | 32.4 | -0.02 |
| llama3.1:8b-instruct-q4_K_M | ? | ? |
| qwen3.5:9b | 0 | -1.00 |
Percentage of correct URLs
+104
View File
@@ -0,0 +1,104 @@
import json
import argparse
from urllib.parse import urlparse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import WebDriverException, TimeoutException
def init_driver():
    """Create a headless Chrome WebDriver used for URL checks.

    Returns:
        A ``webdriver.Chrome`` instance whose page loads time out after
        15 seconds.
    """
    options = Options()
    # ``Options.headless`` was deprecated in Selenium 4.8 and later removed;
    # on current releases assigning it only sets an unused attribute and
    # Chrome opens a visible window. Pass the command-line flag instead
    # (``--headless=new`` requires Chrome 109 or later).
    options.add_argument("--headless=new")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    # Intended to make the automated browser look less like a bot to sites.
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--window-size=1920,1080")

    driver = webdriver.Chrome(options=options)
    # Bound each page load so one slow site cannot stall the whole run.
    driver.set_page_load_timeout(15)
    return driver
def is_root_url(url):
    """Tell whether *url* points at a bare domain.

    Only the path component is inspected: it must be empty or a single
    slash. Query strings and fragments are ignored.
    """
    path = urlparse(url).path
    return path == "" or path == "/"
def is_404_page(driver):
    """Report whether the loaded page looks like a "not found" page.

    The page title and the visible text of ``<body>`` are lower-cased and
    searched for the substrings ``"404"`` and ``"not found"``.

    Returns:
        True if either marker occurs in the title or the body text.
    """
    markers = ("404", "not found")
    page_title = driver.title.lower()
    page_body = driver.find_element("tag name", "body").text.lower()
    return any(
        marker in text
        for text in (page_title, page_body)
        for marker in markers
    )
def check_url_selenium(driver, url):
    """Load *url* in the browser and decide whether it is usable.

    A URL is rejected when:
    - loading it raises a WebDriver error (including a timeout),
    - the resulting page looks like a 404 page, or
    - the browser ends up on a root URL (no path) after redirects.

    Returns:
        True if none of the rejection rules fire, otherwise False.
    """
    try:
        driver.get(url)

        # Rule 1: the page itself announces that it is missing.
        if is_404_page(driver):
            print(f"Page returned 404: {url}")
            return False

        # Rule 2: the browser was bounced to the site's root page.
        landing_url = driver.current_url
        landed_on_root = is_root_url(landing_url)
        if landed_on_root:
            print(f"Redirected to root URL (invalid): {landing_url}")
        return not landed_on_root

    except (WebDriverException, TimeoutException) as exc:
        print(f"Error accessing URL {url}: {exc}")
        return False
def process_event(event, driver, invalid_urls):
    """Validate the URL of a single event and record the outcome.

    Args:
        event: Mapping read from the input; its ``"Url"`` entry is checked
            and a ``"url_valid"`` entry is written back when a URL exists.
        driver: Browser driver handed to ``check_url_selenium``.
        invalid_urls: List that receives the URL when the check fails.

    Returns:
        The result of the check, or False when the event has no URL.
    """
    target = event.get("Url")
    if not target:
        return False

    outcome = check_url_selenium(driver, target)
    event["url_valid"] = outcome
    if not outcome:
        invalid_urls.append(target)
    return outcome
def process_jsonl_file(file_path):
    """Validate every event URL in a JSONL file and print a summary.

    Each non-blank line is parsed as one JSON object. Lines whose
    ``"status"`` is not ``"success"`` are skipped; for the remaining lines
    every entry of ``"events"`` that has a ``"Url"`` is checked in the
    browser.

    Args:
        file_path: Path to the UTF-8 encoded JSONL file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    total_urls = 0
    valid_urls = 0
    invalid_urls = []

    driver = init_driver()
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                # Tolerate blank lines such as a trailing newline.
                if not line.strip():
                    continue
                line_data = json.loads(line)
                if line_data.get("status") != "success":
                    continue
                # ``or []`` also covers an explicit ``"events": null``.
                for event in line_data.get("events") or []:
                    # Events without a URL are not URLs to validate; counting
                    # them would make the total disagree with valid + invalid.
                    if not event.get("Url"):
                        continue
                    total_urls += 1
                    if process_event(event, driver, invalid_urls):
                        valid_urls += 1
    finally:
        # Always shut the browser down, even if reading or parsing fails.
        driver.quit()

    # Summary
    print("\n=== URL Validation Summary ===")
    print(f"Total URLs processed: {total_urls}")
    print(f"Valid URLs (loaded successfully): {valid_urls}")
    print(f"Invalid URLs: {len(invalid_urls)}")
    if invalid_urls:
        print("\nList of invalid URLs:")
        for url in invalid_urls:
            print(url)
if __name__ == "__main__":
    # Command-line entry point: one positional argument, the JSONL path.
    cli = argparse.ArgumentParser(
        description="Validate URLs in JSONL file events using Selenium"
    )
    cli.add_argument("file_path", type=str, help="Path to the JSONL file")
    process_jsonl_file(cli.parse_args().file_path)