Add url validity metrics

This commit is contained in:
William Jeynes
2026-04-04 20:02:25 +01:00
parent 43ecd04135
commit f821e9643d
2 changed files with 38 additions and 24 deletions
+7 -1
View File
@@ -19,4 +19,10 @@ Experiments with different model types:
| llama3.1:8b-instruct-q4_K_M | ? | ? | | llama3.1:8b-instruct-q4_K_M | ? | ? |
| qwen3.5:9b | 0 | -100 | | qwen3.5:9b | 0 | -100 |
%age correct URLS %age valid URLS
| Model | Number | % Age |
|-------------------------------|----------:|---------:|
| gpt-5-mini | 22/405 | 5.43 |
| gpt-5.4-mini | 29/278 | 10.43 |
| llama3.1:8b-instruct-q4_K_M | ? | ? |
| qwen3.5:9b | 0 | 0 |
+27 -19
View File
@@ -4,8 +4,7 @@ from urllib.parse import urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from selenium import webdriver from selenium import webdriver
from selenium.webdriver.chrome.options import Options from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import WebDriverException, TimeoutException from selenium.common.exceptions import WebDriverException, TimeoutException, StaleElementReferenceException
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from tqdm import tqdm from tqdm import tqdm
def init_driver(): def init_driver():
@@ -32,9 +31,15 @@ def is_root_url(url):
return parsed.path in ("", "/") return parsed.path in ("", "/")
def is_404_page(driver):
    """Return True if the currently loaded page looks like a 404 page.

    The check is a plain substring match for "404" in the lower-cased page
    title and in the lower-cased text of the ``<body>`` element.

    Args:
        driver: A WebDriver-like object exposing ``title`` and
            ``find_element(by, value)``.

    Returns:
        True when "404" appears in the title or body text; False otherwise,
        including when the page could not be inspected at all.
    """
    try:
        title = driver.title.lower()
        body_text = driver.find_element("tag name", "body").text.lower()
    except Exception:
        # Best-effort check: any failure while reading the page (a stale
        # <body> reference after a re-render, a missing element, a dead
        # session, ...) is treated as "not a 404". A single handler suffices
        # because StaleElementReferenceException is itself an Exception and
        # was handled identically.
        return False
    # NOTE(review): this matches "404" anywhere in the page text, so pages that
    # merely mention the number are also reported as 404 pages.
    return "404" in title or "404" in body_text
def check_url_selenium(url): def check_url_selenium(url):
driver = None driver = None
@@ -43,34 +48,35 @@ def check_url_selenium(url):
driver.get(url) driver.get(url)
# 404 check # 404 check
if is_404_page(driver): if is_404_page(driver):
print("404") return False, "404 page detected"
return False
# Root URL after redirects # Root URL after redirects
final_url = driver.current_url final_url = driver.current_url
if is_root_url(final_url): if is_root_url(final_url):
print("ROOT") return False, f"Redirected to root URL ({final_url})"
return False return True, None
return True
except (WebDriverException, TimeoutException) as e: except (WebDriverException, TimeoutException) as e:
print(e) return False, str(e)
return False
finally: finally:
if driver: if driver:
driver.quit() driver.quit()
def process_event(event, *, min_score=0.4):
    """Validate the URL of one event, skipping events that score too low.

    Args:
        event: Mapping describing an event. Reads the optional ``"score"``
            and ``"Url"`` keys; when the URL is actually checked, the result
            is written back under ``"url_valid"``.
        min_score: Exclusive lower bound on the score; events whose score is
            not strictly greater are skipped. Defaults to 0.4.

    Returns:
        A ``(url, is_valid, error_msg)`` tuple. ``url`` is None when the event
        was skipped (score too low or no URL); ``error_msg`` is None when the
        URL was checked and found valid.
    """
    # A missing, null or non-numeric score is treated as 0 instead of raising
    # TypeError on the comparison below.
    try:
        score = float(event.get("score", 0))
    except (TypeError, ValueError):
        score = 0.0

    # Written as "not >" rather than "<=" so a NaN score is rejected, matching
    # a "score > threshold" pre-filter.
    if not score > min_score:
        return None, False, "Score too low"

    url = event.get("Url")
    if not url:
        return None, False, "No URL"

    is_valid, error_msg = check_url_selenium(url)
    event["url_valid"] = is_valid
    return url, is_valid, error_msg
def process_jsonl_file(file_path, max_workers=4): def process_jsonl_file(file_path, max_workers=4):
invalid_urls = [] invalid_urls = []
valid_urls = 0 valid_urls = 0
# Gather all events to process # Gather events with score > 0.4
urls_to_check = [] urls_to_check = []
with open(file_path, "r", encoding="utf-8") as f: with open(file_path, "r", encoding="utf-8") as f:
for line in f: for line in f:
@@ -78,6 +84,7 @@ def process_jsonl_file(file_path, max_workers=4):
if line_data.get("status") != "success": if line_data.get("status") != "success":
continue continue
for event in line_data.get("events", []): for event in line_data.get("events", []):
if event.get("score", 0) > 0.4:
urls_to_check.append(event) urls_to_check.append(event)
total_urls = len(urls_to_check) total_urls = len(urls_to_check)
@@ -86,21 +93,22 @@ def process_jsonl_file(file_path, max_workers=4):
with ThreadPoolExecutor(max_workers=max_workers) as executor: with ThreadPoolExecutor(max_workers=max_workers) as executor:
future_to_event = {executor.submit(process_event, e): e for e in urls_to_check} future_to_event = {executor.submit(process_event, e): e for e in urls_to_check}
for future in tqdm(as_completed(future_to_event), total=total_urls, desc="Checking URLs"): for future in tqdm(as_completed(future_to_event), total=total_urls, desc="Checking URLs"):
url, is_valid = future.result() url, is_valid, error_msg = future.result()
if not is_valid and url: if not is_valid and url:
invalid_urls.append(url) invalid_urls.append((url, error_msg))
else: else:
valid_urls += 1 valid_urls += 1
# Summary # Summary
if invalid_urls:
print("\nList of invalid URLs and reasons:")
for url, err in invalid_urls:
print(f"{url} --> {err}")
print("\n=== URL Validation Summary ===") print("\n=== URL Validation Summary ===")
print(f"Total URLs processed: {total_urls}") print(f"Total URLs processed: {total_urls}")
print(f"Valid URLs (loaded successfully): {valid_urls}") print(f"Valid URLs (loaded successfully): {valid_urls}")
print(f"Invalid URLs: {len(invalid_urls)}") print(f"Invalid URLs: {len(invalid_urls)}")
if invalid_urls:
print("\nList of invalid URLs:")
for url in invalid_urls:
print(url)
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Validate URLs in JSONL file events using Selenium") parser = argparse.ArgumentParser(description="Validate URLs in JSONL file events using Selenium")