Start work on calculating the % of valid URLs
This commit is contained in:
@@ -0,0 +1,104 @@
|
||||
import argparse
import json
import re
from urllib.parse import urlparse

from selenium import webdriver
from selenium.common.exceptions import WebDriverException, TimeoutException
from selenium.webdriver.chrome.options import Options
|
||||
|
||||
def init_driver():
    """Create the headless Chrome WebDriver used for URL checks.

    Returns:
        A ``webdriver.Chrome`` instance configured with a 1920x1080 window
        and a 15-second page-load timeout.
    """
    options = Options()
    # Pass the switch directly instead of assigning ``options.headless``:
    # that property was deprecated and later removed in Selenium 4, so on
    # current releases the assignment has no effect and Chrome opens a
    # visible window.  The switch is what the old property added anyway.
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--window-size=1920,1080")

    driver = webdriver.Chrome(options=options)
    # Bound each page load so one unresponsive site cannot stall the run.
    driver.set_page_load_timeout(15)
    return driver
|
||||
|
||||
def is_root_url(url):
    """Tell whether *url* addresses a site root (path is empty or just "/")."""
    path = urlparse(url).path
    return path == "" or path == "/"
|
||||
|
||||
# "404" counts only when it is not embedded in a longer digit run, so text
# such as "555-4040" or "14041" does not mark a page as missing.
_STANDALONE_404 = re.compile(r"(?<!\d)404(?!\d)")


def is_404_page(driver):
    """Heuristically decide whether the loaded page is a "not found" page.

    The page title is inspected first; the body text is only fetched when
    the title gives no indication.

    Args:
        driver: A WebDriver-like object exposing ``title`` and
            ``find_element(by, value)``; the current page is inspected.

    Returns:
        True if the title or the body text contains "not found"
        (case-insensitive) or a standalone "404", otherwise False.
    """
    def looks_missing(text):
        return "not found" in text or _STANDALONE_404.search(text) is not None

    if looks_missing(driver.title.lower()):
        return True

    body_text = driver.find_element("tag name", "body").text.lower()
    return looks_missing(body_text)
|
||||
|
||||
def check_url_selenium(driver, url):
    """Load *url* in the browser and report whether it looks valid.

    A URL is considered valid when:
      - it loads without a WebDriver error or timeout,
      - the resulting page is not detected as a 404 page, and
      - a non-root URL did not end up on a site root after redirects.

    Args:
        driver: The WebDriver used to load the page.
        url: The URL to check.

    Returns:
        True if all checks pass, False otherwise.  The reason for a failure
        is printed to stdout.
    """
    try:
        driver.get(url)

        # Check 1: the page itself reports "not found".
        if is_404_page(driver):
            print(f"Page returned 404: {url}")
            return False

        # Check 2: a deep link that lands on the site root was most likely
        # redirected away from a missing page.  A URL that was already a
        # root was not redirected, so it must not be flagged here.
        final_url = driver.current_url
        if is_root_url(final_url) and not is_root_url(url):
            print(f"Redirected to root URL (invalid): {final_url}")
            return False

        return True
    except (WebDriverException, TimeoutException) as e:
        print(f"Error accessing URL {url}: {e}")
        return False
|
||||
|
||||
def process_event(event, driver, invalid_urls):
    """Validate the URL of one event and record the outcome.

    Args:
        event: Mapping describing the event; its "Url" entry is checked and
            a "url_valid" entry is written back when a URL is present.
        driver: The WebDriver handed on to ``check_url_selenium``.
        invalid_urls: List that receives the URL when the check fails.

    Returns:
        The result of the check, or False when the event has no URL.
    """
    target = event.get("Url")
    if not target:
        # Nothing to check; the event is left untouched.
        return False

    outcome = check_url_selenium(driver, target)
    event["url_valid"] = outcome
    if not outcome:
        invalid_urls.append(target)
    return outcome
|
||||
|
||||
def process_jsonl_file(file_path):
    """Validate every event URL in a JSONL file and print a summary.

    Each non-blank line is parsed as a JSON object.  Lines whose "status"
    is not "success" are skipped; for the others, every entry of "events"
    that carries a "Url" is checked in the browser.

    Args:
        file_path: Path to the JSONL file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    total_urls = 0
    valid_urls = 0
    invalid_urls = []

    # Open the file first so a bad path fails before a browser is launched.
    with open(file_path, "r", encoding="utf-8") as f:
        driver = init_driver()
        try:
            for line in f:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                if not line.strip():
                    continue

                line_data = json.loads(line)
                if line_data.get("status") != "success":
                    continue

                # ``or []`` also covers an explicit ``"events": null``.
                for event in line_data.get("events") or []:
                    # Events without a URL are not counted, so that
                    # total == valid + invalid in the summary.
                    if not event.get("Url"):
                        continue
                    total_urls += 1
                    if process_event(event, driver, invalid_urls):
                        valid_urls += 1
        finally:
            # Always shut the browser down, even if parsing or a check fails.
            driver.quit()

    # Summary
    print("\n=== URL Validation Summary ===")
    print(f"Total URLs processed: {total_urls}")
    print(f"Valid URLs (loaded successfully): {valid_urls}")
    print(f"Invalid URLs: {len(invalid_urls)}")
    if invalid_urls:
        print("\nList of invalid URLs:")
        for url in invalid_urls:
            print(url)
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: takes one positional argument, the JSONL path.
    cli = argparse.ArgumentParser(
        description="Validate URLs in JSONL file events using Selenium"
    )
    cli.add_argument("file_path", type=str, help="Path to the JSONL file")
    process_jsonl_file(cli.parse_args().file_path)
|
||||
Reference in New Issue
Block a user