Add url validity metrics
This commit is contained in:
+7
-1
@@ -19,4 +19,10 @@ Experiments with different model types:
|
|||||||
| llama3.1:8b-instruct-q4_K_M | ? | ? |
|
| llama3.1:8b-instruct-q4_K_M | ? | ? |
|
||||||
| qwen3.5:9b | 0 | -100 |
|
| qwen3.5:9b | 0 | -100 |
|
||||||
|
|
||||||
%age correct URLS
|
%age valid URLS
|
||||||
|
| Model | Number | % Age |
|
||||||
|
|-------------------------------|----------:|---------:|
|
||||||
|
| gpt-5-mini | 22/405 | 5.43 |
|
||||||
|
| gpt-5.4-mini | 29/278 | 10.43 |
|
||||||
|
| llama3.1:8b-instruct-q4_K_M | ? | ? |
|
||||||
|
| qwen3.5:9b | 0 | 0 |
|
||||||
@@ -4,8 +4,7 @@ from urllib.parse import urlparse
|
|||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
from selenium import webdriver
|
from selenium import webdriver
|
||||||
from selenium.webdriver.chrome.options import Options
|
from selenium.webdriver.chrome.options import Options
|
||||||
from selenium.common.exceptions import WebDriverException, TimeoutException
|
from selenium.common.exceptions import WebDriverException, TimeoutException, StaleElementReferenceException
|
||||||
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
def init_driver():
|
def init_driver():
|
||||||
@@ -32,9 +31,15 @@ def is_root_url(url):
|
|||||||
return parsed.path in ("", "/")
|
return parsed.path in ("", "/")
|
||||||
|
|
||||||
def is_404_page(driver):
|
def is_404_page(driver):
|
||||||
title = driver.title.lower()
|
"""Safely check for 404, handling stale elements."""
|
||||||
body_text = driver.find_element("tag name", "body").text.lower()
|
try:
|
||||||
return "404" in title or "404" in body_text
|
title = driver.title.lower()
|
||||||
|
body_text = driver.find_element("tag name", "body").text.lower()
|
||||||
|
return "404" in title or "404" in body_text
|
||||||
|
except StaleElementReferenceException:
|
||||||
|
return False
|
||||||
|
except Exception:
|
||||||
|
return False
|
||||||
|
|
||||||
def check_url_selenium(url):
|
def check_url_selenium(url):
|
||||||
driver = None
|
driver = None
|
||||||
@@ -43,34 +48,35 @@ def check_url_selenium(url):
|
|||||||
driver.get(url)
|
driver.get(url)
|
||||||
# 404 check
|
# 404 check
|
||||||
if is_404_page(driver):
|
if is_404_page(driver):
|
||||||
print("404")
|
return False, "404 page detected"
|
||||||
return False
|
|
||||||
# Root URL after redirects
|
# Root URL after redirects
|
||||||
final_url = driver.current_url
|
final_url = driver.current_url
|
||||||
if is_root_url(final_url):
|
if is_root_url(final_url):
|
||||||
print("ROOT")
|
return False, f"Redirected to root URL ({final_url})"
|
||||||
return False
|
return True, None
|
||||||
return True
|
|
||||||
except (WebDriverException, TimeoutException) as e:
|
except (WebDriverException, TimeoutException) as e:
|
||||||
print(e)
|
return False, str(e)
|
||||||
return False
|
|
||||||
finally:
|
finally:
|
||||||
if driver:
|
if driver:
|
||||||
driver.quit()
|
driver.quit()
|
||||||
|
|
||||||
def process_event(event):
|
def process_event(event):
|
||||||
|
"""Process an event only if score > 0.4."""
|
||||||
|
score = event.get("score", 0)
|
||||||
|
if score <= 0.4:
|
||||||
|
return None, False, "Score too low"
|
||||||
url = event.get("Url")
|
url = event.get("Url")
|
||||||
if not url:
|
if not url:
|
||||||
return None, False
|
return None, False, "No URL"
|
||||||
is_valid = check_url_selenium(url)
|
is_valid, error_msg = check_url_selenium(url)
|
||||||
event["url_valid"] = is_valid
|
event["url_valid"] = is_valid
|
||||||
return url, is_valid
|
return url, is_valid, error_msg
|
||||||
|
|
||||||
def process_jsonl_file(file_path, max_workers=4):
|
def process_jsonl_file(file_path, max_workers=4):
|
||||||
invalid_urls = []
|
invalid_urls = []
|
||||||
valid_urls = 0
|
valid_urls = 0
|
||||||
|
|
||||||
# Gather all events to process
|
# Gather events with score > 0.4
|
||||||
urls_to_check = []
|
urls_to_check = []
|
||||||
with open(file_path, "r", encoding="utf-8") as f:
|
with open(file_path, "r", encoding="utf-8") as f:
|
||||||
for line in f:
|
for line in f:
|
||||||
@@ -78,7 +84,8 @@ def process_jsonl_file(file_path, max_workers=4):
|
|||||||
if line_data.get("status") != "success":
|
if line_data.get("status") != "success":
|
||||||
continue
|
continue
|
||||||
for event in line_data.get("events", []):
|
for event in line_data.get("events", []):
|
||||||
urls_to_check.append(event)
|
if event.get("score", 0) > 0.4:
|
||||||
|
urls_to_check.append(event)
|
||||||
|
|
||||||
total_urls = len(urls_to_check)
|
total_urls = len(urls_to_check)
|
||||||
|
|
||||||
@@ -86,21 +93,22 @@ def process_jsonl_file(file_path, max_workers=4):
|
|||||||
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
||||||
future_to_event = {executor.submit(process_event, e): e for e in urls_to_check}
|
future_to_event = {executor.submit(process_event, e): e for e in urls_to_check}
|
||||||
for future in tqdm(as_completed(future_to_event), total=total_urls, desc="Checking URLs"):
|
for future in tqdm(as_completed(future_to_event), total=total_urls, desc="Checking URLs"):
|
||||||
url, is_valid = future.result()
|
url, is_valid, error_msg = future.result()
|
||||||
if not is_valid and url:
|
if not is_valid and url:
|
||||||
invalid_urls.append(url)
|
invalid_urls.append((url, error_msg))
|
||||||
else:
|
else:
|
||||||
valid_urls += 1
|
valid_urls += 1
|
||||||
|
|
||||||
# Summary
|
# Summary
|
||||||
|
if invalid_urls:
|
||||||
|
print("\nList of invalid URLs and reasons:")
|
||||||
|
for url, err in invalid_urls:
|
||||||
|
print(f"{url} --> {err}")
|
||||||
print("\n=== URL Validation Summary ===")
|
print("\n=== URL Validation Summary ===")
|
||||||
print(f"Total URLs processed: {total_urls}")
|
print(f"Total URLs processed: {total_urls}")
|
||||||
print(f"Valid URLs (loaded successfully): {valid_urls}")
|
print(f"Valid URLs (loaded successfully): {valid_urls}")
|
||||||
print(f"Invalid URLs: {len(invalid_urls)}")
|
print(f"Invalid URLs: {len(invalid_urls)}")
|
||||||
if invalid_urls:
|
|
||||||
print("\nList of invalid URLs:")
|
|
||||||
for url in invalid_urls:
|
|
||||||
print(url)
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parser = argparse.ArgumentParser(description="Validate URLs in JSONL file events using Selenium")
|
parser = argparse.ArgumentParser(description="Validate URLs in JSONL file events using Selenium")
|
||||||
|
|||||||
Reference in New Issue
Block a user