From f821e9643d2f75b1fdc09a4c26a08e1c2a2c0abf Mon Sep 17 00:00:00 2001
From: William Jeynes <william.jeynes@hotmail.com>
Date: Sat, 4 Apr 2026 20:02:25 +0100
Subject: [PATCH] Add URL validity metrics

---
 agent/README.md               |  8 +++++-
 supporting/checker/checker.py | 54 ++++++++++++++++++++---------------
 2 files changed, 38 insertions(+), 24 deletions(-)

diff --git a/agent/README.md b/agent/README.md
index 57b1c74..88e9fbe 100644
--- a/agent/README.md
+++ b/agent/README.md
@@ -19,4 +19,10 @@ Experiments with different model types:
 | llama3.1:8b-instruct-q4_K_M   | ?         | ?        |
 | qwen3.5:9b                    | 0         | -100     |
 
-%age correct URLS
\ No newline at end of file
+Percentage of valid URLs
+| Model                         | Number    | Percent  |
+|-------------------------------|----------:|---------:|
+| gpt-5-mini                    | 22/405    | 5.43     |
+| gpt-5.4-mini                  | 29/278    | 10.43    |
+| llama3.1:8b-instruct-q4_K_M   | ?         | ?        |
+| qwen3.5:9b                    | 0         | 0        |
\ No newline at end of file
diff --git a/supporting/checker/checker.py b/supporting/checker/checker.py
index fa9b092..a8f1359 100644
--- a/supporting/checker/checker.py
+++ b/supporting/checker/checker.py
@@ -4,8 +4,7 @@ from urllib.parse import urlparse
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
-from selenium.common.exceptions import WebDriverException, TimeoutException
-from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
+from selenium.common.exceptions import WebDriverException, TimeoutException, StaleElementReferenceException
 from tqdm import tqdm
 
 def init_driver():
@@ -32,9 +31,15 @@ def is_root_url(url):
     return parsed.path in ("", "/")
 
 def is_404_page(driver):
-    title = driver.title.lower()
-    body_text = driver.find_element("tag name", "body").text.lower()
-    return "404" in title or "404" in body_text
+    """Safely check for 404, handling stale elements."""
+    try:
+        title = driver.title.lower()
+        body_text = driver.find_element("tag name", "body").text.lower()
+        return "404" in title or "404" in body_text
+    except StaleElementReferenceException:
+        return False
+    except Exception:
+        return False
 
 def check_url_selenium(url):
     driver = None
@@ -43,34 +48,35 @@ def check_url_selenium(url):
         driver.get(url)
         # 404 check
         if is_404_page(driver):
-            print("404")
-            return False
+            return False, "404 page detected"
         # Root URL after redirects
         final_url = driver.current_url
         if is_root_url(final_url):
-            print("ROOT")
-            return False
-        return True
+            return False, f"Redirected to root URL ({final_url})"
+        return True, None
     except (WebDriverException, TimeoutException) as e:
-        print(e)
-        return False
+        return False, str(e)
     finally:
         if driver:
             driver.quit()
 
 def process_event(event):
+    """Process an event only if score > 0.4."""
+    score = event.get("score", 0)
+    if score <= 0.4:
+        return None, False, "Score too low"
     url = event.get("Url")
     if not url:
-        return None, False
-    is_valid = check_url_selenium(url)
+        return None, False, "No URL"
+    is_valid, error_msg = check_url_selenium(url)
     event["url_valid"] = is_valid
-    return url, is_valid
+    return url, is_valid, error_msg
 
 def process_jsonl_file(file_path, max_workers=4):
     invalid_urls = []
     valid_urls = 0
 
-    # Gather all events to process
+    # Gather events with score > 0.4
     urls_to_check = []
     with open(file_path, "r", encoding="utf-8") as f:
         for line in f:
@@ -78,7 +84,8 @@ def process_jsonl_file(file_path, max_workers=4):
             if line_data.get("status") != "success":
                 continue
             for event in line_data.get("events", []):
-                urls_to_check.append(event)
+                if event.get("score", 0) > 0.4:
+                    urls_to_check.append(event)
 
     total_urls = len(urls_to_check)
 
@@ -86,21 +93,22 @@ def process_jsonl_file(file_path, max_workers=4):
     with ThreadPoolExecutor(max_workers=max_workers) as executor:
         future_to_event = {executor.submit(process_event, e): e for e in urls_to_check}
         for future in tqdm(as_completed(future_to_event), total=total_urls, desc="Checking URLs"):
-            url, is_valid = future.result()
+            url, is_valid, error_msg = future.result()
             if not is_valid and url:
-                invalid_urls.append(url)
+                invalid_urls.append((url, error_msg))
             else:
                 valid_urls += 1
 
     # Summary
+    if invalid_urls:
+        print("\nList of invalid URLs and reasons:")
+        for url, err in invalid_urls:
+            print(f"{url} --> {err}")
     print("\n=== URL Validation Summary ===")
     print(f"Total URLs processed: {total_urls}")
     print(f"Valid URLs (loaded successfully): {valid_urls}")
     print(f"Invalid URLs: {len(invalid_urls)}")
-    if invalid_urls:
-        print("\nList of invalid URLs:")
-        for url in invalid_urls:
-            print(url)
+    
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Validate URLs in JSONL file events using Selenium")