add multithreading

2026-04-04 19:42:02 +01:00
parent 8c0921057b
commit 43ecd04135
1 changed files with 48 additions and 41 deletions
@@ -1,90 +1,96 @@
 import json
 import argparse
 from urllib.parse import urlparse
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
 from selenium.common.exceptions import WebDriverException, TimeoutException
+from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
+from tqdm import tqdm

 def init_driver():
    options = Options()
    options.headless = True
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
+    options.add_argument("--headless")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--window-size=1920,1080")
+    prefs = {
+        "profile.managed_default_content_settings.images": 2,  # block images
+        "profile.default_content_setting_values.stylesheets": 2,  # block CSS
+        "profile.managed_default_content_settings.cookies": 2,  # optional
+    }
+    options.add_experimental_option("prefs", prefs)
+
    driver = webdriver.Chrome(options=options)
-    driver.set_page_load_timeout(15)
+    driver.set_page_load_timeout(30)
    return driver

 def is_root_url(url):
-    """Return True if URL is a root domain with no path."""
    parsed = urlparse(url)
    return parsed.path in ("", "/")

 def is_404_page(driver):
-    """Check for 404 indicators in title or body text."""
    title = driver.title.lower()
    body_text = driver.find_element("tag name", "body").text.lower()
-    if "404" in title or "not found" in title:
-        return True
-    if "404" in body_text or "not found" in body_text:
-        return True
-    return False
+    return "404" in title or "404" in body_text

-def check_url_selenium(driver, url):
-    """
-    Check if a URL is valid:
-    - Loads without error
-    - Not a 404 page
-    - Not redirected to a root domain
-    """
+def check_url_selenium(url):
+    driver = None
    try:
+        driver = init_driver()
        driver.get(url)
-        # 1️⃣ Check if page is 404
+        # 404 check
        if is_404_page(driver):
-            print(f"Page returned 404: {url}")
+            print("404")
            return False
-        # 2️⃣ Check if current URL after redirects is a root URL
+        # Root URL after redirects
        final_url = driver.current_url
        if is_root_url(final_url):
-            print(f"Redirected to root URL (invalid): {final_url}")
+            print("ROOT")
            return False
        return True
    except (WebDriverException, TimeoutException) as e:
-        print(f"Error accessing URL {url}: {e}")
+        print(e)
        return False
+    finally:
+        if driver:
+            driver.quit()

-def process_event(event, driver, invalid_urls):
+def process_event(event):
    url = event.get("Url")
-    if url:
-        is_valid = check_url_selenium(driver, url)
-        event["url_valid"] = is_valid
-        if not is_valid:
-            invalid_urls.append(url)
-        return is_valid
-    return False
+    if not url:
+        return None, False
+    is_valid = check_url_selenium(url)
+    event["url_valid"] = is_valid
+    return url, is_valid

-def process_jsonl_file(file_path):
-    total_urls = 0
-    valid_urls = 0
+def process_jsonl_file(file_path, max_workers=4):
    invalid_urls = []
+    valid_urls = 0

-    driver = init_driver()
-
+    # Gather all events to process
+    urls_to_check = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line_data = json.loads(line)
-
            if line_data.get("status") != "success":
                continue
+            for event in line_data.get("events", []):
+                urls_to_check.append(event)

-            events = line_data.get("events", [])
-            for event in events:
-                total_urls += 1
-                if process_event(event, driver, invalid_urls):
-                    valid_urls += 1
+    total_urls = len(urls_to_check)

-    driver.quit()
+    # ThreadPoolExecutor with tqdm progress bar
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        future_to_event = {executor.submit(process_event, e): e for e in urls_to_check}
+        for future in tqdm(as_completed(future_to_event), total=total_urls, desc="Checking URLs"):
+            url, is_valid = future.result()
+            if not is_valid and url:
+                invalid_urls.append(url)
+            else:
+                valid_urls += 1

    # Summary
    print("\n=== URL Validation Summary ===")
@@ -99,6 +105,7 @@ def process_jsonl_file(file_path):
 if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Validate URLs in JSONL file events using Selenium")
    parser.add_argument("file_path", type=str, help="Path to the JSONL file")
+    parser.add_argument("--workers", type=int, default=4, help="Number of parallel Selenium workers")
    args = parser.parse_args()

-    process_jsonl_file(args.file_path)
+    process_jsonl_file(args.file_path, max_workers=args.workers)