"""Validate the URLs of events stored in a JSONL file using headless Chrome.

Every event of every record whose ``status`` is ``"success"`` is checked in a
thread pool.  A URL is considered invalid when the page cannot be loaded, when
the page looks like a 404 page, or when the browser ends up on a root URL
(no path) after redirects.
"""

import argparse
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urlparse

from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.chrome.options import Options
# Not referenced by the code visible in this review; kept so nothing that may
# rely on it elsewhere breaks.
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities  # noqa: F401
from tqdm import tqdm


def init_driver():
    """Create a headless Chrome driver with a 30 second page-load timeout.

    Returns:
        A new ``webdriver.Chrome`` instance.  The caller owns it and must call
        ``quit()`` when done.
    """
    options = Options()
    # Headless mode is requested through the command-line switch only: the
    # ``Options.headless`` attribute setter was deprecated and later removed
    # in Selenium 4, and it was redundant with this switch anyway.
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--window-size=1920,1080")

    prefs = {
        "profile.managed_default_content_settings.images": 2,  # block images
        "profile.default_content_setting_values.stylesheets": 2,  # block CSS
        "profile.managed_default_content_settings.cookies": 2,  # optional
    }
    options.add_experimental_option("prefs", prefs)

    driver = webdriver.Chrome(options=options)
    driver.set_page_load_timeout(30)
    return driver


def is_root_url(url):
    """Return True if *url* has no path component (``""`` or ``"/"``)."""
    return urlparse(url).path in ("", "/")


def is_404_page(driver):
    """Return True if "404" appears in the page title or in the body text.

    NOTE(review): this is a plain substring match, so a page that merely
    mentions "404" anywhere in its text is also reported as a 404 page.
    """
    title = driver.title.lower()
    body_text = driver.find_element("tag name", "body").text.lower()
    return "404" in title or "404" in body_text


def check_url_selenium(url):
    """Load *url* in a fresh browser and report whether it is valid.

    A URL is valid when the page loads without a WebDriver error, is not
    detected as a 404 page, and the final URL after redirects is not a root
    URL.  A dedicated driver is created per call (so calls are independent
    across worker threads) and is always quit afterwards.

    Returns:
        True if the URL is valid, False otherwise.
    """
    driver = None
    try:
        driver = init_driver()
        driver.get(url)

        # Messages carry the URL and go through tqdm.write: with several
        # workers a bare "404" cannot be attributed to a URL, and a plain
        # print() would break the progress bar.
        if is_404_page(driver):
            tqdm.write(f"Page returned 404: {url}")
            return False

        # Root URL after redirects
        final_url = driver.current_url
        if is_root_url(final_url):
            tqdm.write(f"Redirected to root URL (invalid): {final_url}")
            return False

        return True
    except (WebDriverException, TimeoutException) as e:
        tqdm.write(f"Error accessing URL {url}: {e}")
        return False
    finally:
        # driver stays None when init_driver() itself failed.
        if driver:
            driver.quit()


def process_event(event):
    """Validate the ``"Url"`` of *event* and record the result on the event.

    Returns:
        A ``(url, is_valid)`` tuple.  When the event has no (or an empty)
        ``"Url"``, returns ``(None, False)`` and leaves the event untouched;
        otherwise ``event["url_valid"]`` is set to the result.
    """
    url = event.get("Url")
    if not url:
        return None, False

    is_valid = check_url_selenium(url)
    event["url_valid"] = is_valid
    return url, is_valid


def process_jsonl_file(file_path, max_workers=4):
    """Check every event URL in a JSONL file and print a summary.

    Args:
        file_path: Path to a JSONL file; each line is a JSON object with a
            ``"status"`` and an ``"events"`` list.  Only records whose status
            is ``"success"`` are processed.
        max_workers: Number of URLs checked in parallel (one browser each).
    """
    invalid_urls = []
    valid_urls = 0

    # Gather all events to process
    events_to_check = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines instead of failing in json.loads.
                continue
            line_data = json.loads(line)
            if line_data.get("status") != "success":
                continue
            events_to_check.extend(line_data.get("events", []))

    # Counts events, including any that turn out to carry no URL.
    total_urls = len(events_to_check)

    # ThreadPoolExecutor with tqdm progress bar
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_event = {
            executor.submit(process_event, event): event
            for event in events_to_check
        }
        progress = tqdm(
            as_completed(future_to_event),
            total=total_urls,
            desc="Checking URLs",
        )
        for future in progress:
            try:
                url, is_valid = future.result()
            except Exception as exc:
                # One unexpected failure must not abort the whole run; count
                # that URL as invalid and keep going.
                event = future_to_event[future]
                url, is_valid = event.get("Url"), False
                event["url_valid"] = False
                tqdm.write(f"Error accessing URL {url}: {exc}")

            if not url:
                # Event without a URL: neither valid nor invalid.
                continue
            if is_valid:
                valid_urls += 1
            else:
                invalid_urls.append(url)

    # Summary
    print("\n=== URL Validation Summary ===")
    # NOTE(review): only the header line above is visible in the reviewed
    # diff; the remaining summary lines are a reconstruction -- confirm the
    # exact wording against the original file.
    print(f"Total URLs checked: {total_urls}")
    print(f"Valid URLs: {valid_urls}")
    print(f"Invalid URLs: {len(invalid_urls)}")
    if invalid_urls:
        print("\nInvalid URLs:")
        for url in invalid_urls:
            print(f"  {url}")


def main():
    """Parse command-line arguments and run the validation."""
    parser = argparse.ArgumentParser(
        description="Validate URLs in JSONL file events using Selenium"
    )
    parser.add_argument("file_path", type=str, help="Path to the JSONL file")
    parser.add_argument(
        "--workers",
        type=int,
        default=4,
        help="Number of parallel Selenium workers",
    )
    args = parser.parse_args()

    process_jsonl_file(args.file_path, max_workers=args.workers)


if __name__ == "__main__":
    main()