start on work to calculate % if valid URLS

2026-04-04 18:52:47 +01:00
parent b610e8c989
commit 8c0921057b
2 changed files with 124 additions and 1 deletions
@@ -1,3 +1,22 @@
 ## Refining the agent output
-TODO: Table and document experiments
+Experiments modifying pipeline
 | Model            | % Correct | % Change |
 |------------------|----------:|---------:|
 | BASELINE         | 33        | 0        |
 | Improv Prompt    | 39.96     | 0.21     |
 | Add Examples     | 44.67     | 0.35     |
 | Date             | 45.51     | 0.38     |
 | Chain of Thought | 43.38     | 0.31     |
 | Self-Critique    | 44.36     | 0.34     |
 Experiments with different model types:
 | Model                         | % Correct | % Change |
 |-------------------------------|----------:|---------:|
 | gpt-5-mini                    | 33        | 0        |
 | gpt-5.4-mini                  | 32.4      | -0.02    |
 | llama3.1:8b-instruct-q4_K_M   | ?         | ?        |
 | qwen3.5:9b                    | 0         | -100     |
 %age correct URLS
@@ -0,0 +1,104 @@
 import json
 import argparse
 from urllib.parse import urlparse
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
 from selenium.common.exceptions import WebDriverException, TimeoutException
 def init_driver():
    options = Options()
    options.headless = True
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--window-size=1920,1080")
    driver = webdriver.Chrome(options=options)
    driver.set_page_load_timeout(15)
    return driver
 def is_root_url(url):
    """Return True if URL is a root domain with no path."""
    parsed = urlparse(url)
    return parsed.path in ("", "/")
 def is_404_page(driver):
    """Check for 404 indicators in title or body text."""
    title = driver.title.lower()
    body_text = driver.find_element("tag name", "body").text.lower()
    if "404" in title or "not found" in title:
        return True
    if "404" in body_text or "not found" in body_text:
        return True
    return False
 def check_url_selenium(driver, url):
    """
    Check if a URL is valid:
    - Loads without error
    - Not a 404 page
    - Not redirected to a root domain
    """
    try:
        driver.get(url)
        # 1️⃣ Check if page is 404
        if is_404_page(driver):
            print(f"Page returned 404: {url}")
            return False
        # 2️⃣ Check if current URL after redirects is a root URL
        final_url = driver.current_url
        if is_root_url(final_url):
            print(f"Redirected to root URL (invalid): {final_url}")
            return False
        return True
    except (WebDriverException, TimeoutException) as e:
        print(f"Error accessing URL {url}: {e}")
        return False
 def process_event(event, driver, invalid_urls):
    url = event.get("Url")
    if url:
        is_valid = check_url_selenium(driver, url)
        event["url_valid"] = is_valid
        if not is_valid:
            invalid_urls.append(url)
        return is_valid
    return False
 def process_jsonl_file(file_path):
    total_urls = 0
    valid_urls = 0
    invalid_urls = []
    driver = init_driver()
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line_data = json.loads(line)
            if line_data.get("status") != "success":
                continue
            events = line_data.get("events", [])
            for event in events:
                total_urls += 1
                if process_event(event, driver, invalid_urls):
                    valid_urls += 1
    driver.quit()
    # Summary
    print("\n=== URL Validation Summary ===")
    print(f"Total URLs processed: {total_urls}")
    print(f"Valid URLs (loaded successfully): {valid_urls}")
    print(f"Invalid URLs: {len(invalid_urls)}")
    if invalid_urls:
        print("\nList of invalid URLs:")
        for url in invalid_urls:
            print(url)
 if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Validate URLs in JSONL file events using Selenium")
    parser.add_argument("file_path", type=str, help="Path to the JSONL file")
    args = parser.parse_args()
    process_jsonl_file(args.file_path)