119 lines
4.2 KiB
Python
119 lines
4.2 KiB
Python
import json
|
|
import argparse
|
|
from urllib.parse import urlparse
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
from selenium import webdriver
|
|
from selenium.webdriver.chrome.options import Options
|
|
from selenium.common.exceptions import WebDriverException, TimeoutException, StaleElementReferenceException
|
|
from tqdm import tqdm
|
|
|
|
def init_driver():
    """Create a headless Chrome WebDriver configured for quick page checks.

    The browser is started with a fixed 1920x1080 window and a 30-second
    page-load timeout. Chrome preferences are set that request blocking of
    images, stylesheets and cookies.

    Returns:
        A new ``webdriver.Chrome`` instance. The caller owns it and must call
        ``quit()`` when done.
    """
    options = Options()
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    # Headless mode is requested through the command-line switch only. The
    # ``Options.headless`` setter is deprecated in Selenium 4.8+ and was
    # redundant with this argument.
    options.add_argument("--headless")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--window-size=1920,1080")

    prefs = {
        "profile.managed_default_content_settings.images": 2,  # block images
        # NOTE(review): confirm current Chrome builds honor this key.
        "profile.default_content_setting_values.stylesheets": 2,  # block CSS
        "profile.managed_default_content_settings.cookies": 2,  # optional
    }
    options.add_experimental_option("prefs", prefs)

    driver = webdriver.Chrome(options=options)
    driver.set_page_load_timeout(30)
    return driver
|
|
|
|
def is_root_url(url):
    """Return True when *url* has no path beyond the site root.

    Only the path component is inspected, so a query string or fragment does
    not stop a URL from counting as root.

    Args:
        url: URL string to inspect.

    Returns:
        True if the parsed path is empty or exactly "/", otherwise False.
    """
    parsed = urlparse(url)
    return parsed.path in ("", "/")
|
|
|
|
def is_404_page(driver):
    """Heuristically report whether the currently loaded page is a 404 page.

    The check is a plain substring search for "404" in the lower-cased page
    title and in the lower-cased text of the ``body`` element.

    Args:
        driver: Object exposing ``title`` and ``find_element(by, value)``
            (a Selenium WebDriver in this script).

    Returns:
        True if "404" appears in the title or body text. False otherwise,
        including when reading the page raises any exception.
    """
    try:
        title = driver.title.lower()
        body_text = driver.find_element("tag name", "body").text.lower()
        return "404" in title or "404" in body_text
    except Exception:
        # Best-effort check: any failure while reading the page (stale
        # element references included) is treated as "not a 404 page".
        return False
|
|
|
|
def check_url_selenium(url):
    """Load *url* in a fresh browser and decide whether it is a usable page.

    A new driver is created for every call and always quit afterwards.
    The URL is rejected when the loaded page looks like a 404 page, when the
    browser's final URL is a site root, or when Selenium raises while
    creating the driver or loading the page.

    Args:
        url: URL to open.

    Returns:
        ``(True, None)`` when the page passes both checks, otherwise
        ``(False, reason)`` where *reason* is a human-readable string.
    """
    driver = None
    try:
        driver = init_driver()
        driver.get(url)

        # 404 check
        if is_404_page(driver):
            return False, "404 page detected"

        # Root URL after redirects
        final_url = driver.current_url
        if is_root_url(final_url):
            return False, f"Redirected to root URL ({final_url})"

        return True, None
    except WebDriverException as e:
        # TimeoutException is a subclass of WebDriverException, so page-load
        # timeouts are reported here as well.
        return False, str(e)
    finally:
        # The driver may be None if init_driver() itself failed.
        if driver is not None:
            driver.quit()
|
|
|
|
def process_event(event):
    """Validate the URL of one event, but only if its score is above 0.4.

    Args:
        event: Dict describing an event. The "score" and "Url" keys are
            read. When the URL is actually checked, the result is written
            back under the "url_valid" key.

    Returns:
        A ``(url, is_valid, error_msg)`` tuple:
        ``(None, False, "Score too low")`` when the score is missing, null
        or not above 0.4; ``(None, False, "No URL")`` when the event has no
        URL; otherwise the URL together with the result of
        ``check_url_selenium``.
    """
    # A missing or null score counts as 0, so the comparison below cannot
    # raise TypeError on `None <= 0.4`.
    score = event.get("score") or 0
    if score <= 0.4:
        return None, False, "Score too low"

    url = event.get("Url")
    if not url:
        return None, False, "No URL"

    is_valid, error_msg = check_url_selenium(url)
    event["url_valid"] = is_valid
    return url, is_valid, error_msg
|
|
|
|
def process_jsonl_file(file_path, max_workers=4):
    """Validate event URLs from a JSONL file in parallel and print a summary.

    Every non-blank line is parsed as a JSON object. Lines whose "status" is
    not "success" are ignored. Each entry of the line's "events" list with a
    score above 0.4 is handed to ``process_event`` on a thread pool, with a
    tqdm progress bar tracking completion. Afterwards the invalid URLs (with
    reasons) and the overall counts are printed to stdout.

    Args:
        file_path: Path to the UTF-8 encoded JSONL file.
        max_workers: Number of worker threads used to check URLs.
    """
    # Gather events with score > 0.4
    urls_to_check = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            line_data = json.loads(line)
            if line_data.get("status") != "success":
                continue
            # `or` guards against JSON nulls for "events" and "score".
            for event in line_data.get("events") or []:
                if (event.get("score") or 0) > 0.4:
                    urls_to_check.append(event)

    total_urls = len(urls_to_check)
    invalid_urls = []
    valid_urls = 0
    skipped = 0

    # ThreadPoolExecutor with tqdm progress bar
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_event, e) for e in urls_to_check]
        for future in tqdm(as_completed(futures), total=total_urls, desc="Checking URLs"):
            url, is_valid, error_msg = future.result()
            if is_valid:
                valid_urls += 1
            elif url:
                invalid_urls.append((url, error_msg))
            else:
                # No URL was checked (e.g. the event has no "Url"); this
                # must not be counted as a valid URL.
                skipped += 1

    # Summary
    if invalid_urls:
        print("\nList of invalid URLs and reasons:")
        for url, err in invalid_urls:
            print(f"{url} --> {err}")

    print("\n=== URL Validation Summary ===")
    print(f"Total URLs processed: {total_urls}")
    print(f"Valid URLs (loaded successfully): {valid_urls}")
    print(f"Invalid URLs: {len(invalid_urls)}")
    if skipped:
        print(f"Skipped (no URL to check): {skipped}")
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: read the JSONL path and the worker count,
    # then run the validation and print its summary.
    parser = argparse.ArgumentParser(description="Validate URLs in JSONL file events using Selenium")
    parser.add_argument("file_path", type=str, help="Path to the JSONL file")
    parser.add_argument("--workers", type=int, default=4, help="Number of parallel Selenium workers")
    args = parser.parse_args()

    process_jsonl_file(args.file_path, max_workers=args.workers)