Add multithreading: check URLs in parallel with a ThreadPoolExecutor
This commit is contained in:
@@ -1,90 +1,96 @@
|
|||||||
import json
|
import json
|
||||||
import argparse
|
import argparse
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
from selenium import webdriver
|
from selenium import webdriver
|
||||||
from selenium.webdriver.chrome.options import Options
|
from selenium.webdriver.chrome.options import Options
|
||||||
from selenium.common.exceptions import WebDriverException, TimeoutException
|
from selenium.common.exceptions import WebDriverException, TimeoutException
|
||||||
|
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
def init_driver(page_load_timeout=30):
    """Create a headless Chrome WebDriver tuned for quick page checks.

    Args:
        page_load_timeout: Seconds a page load may take before Selenium
            raises ``TimeoutException``. Defaults to 30.

    Returns:
        A new ``webdriver.Chrome`` instance. The caller owns it and is
        responsible for calling ``quit()`` on it.
    """
    options = Options()
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    # Headless mode is requested through the command-line flag only; the
    # ``options.headless`` attribute is deprecated in Selenium 4 and was
    # redundant next to this argument.
    options.add_argument("--headless")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--window-size=1920,1080")

    # Content-setting value 2 asks Chrome to block the resource type.
    prefs = {
        "profile.managed_default_content_settings.images": 2,  # block images
        # NOTE(review): confirm Chrome actually honours a "stylesheets" key.
        "profile.default_content_setting_values.stylesheets": 2,  # block CSS
        "profile.managed_default_content_settings.cookies": 2,  # optional
    }
    options.add_experimental_option("prefs", prefs)

    driver = webdriver.Chrome(options=options)
    driver.set_page_load_timeout(page_load_timeout)
    return driver
|
||||||
|
|
||||||
def is_root_url(url: str) -> bool:
    """Return True if *url* points at a site root, i.e. has no path.

    Only the path component is inspected: a URL whose path is empty or a
    single ``/`` counts as root, regardless of any query string or fragment.

    Args:
        url: The URL to inspect.

    Returns:
        True when the parsed path is ``""`` or ``"/"``, otherwise False.
    """
    parsed = urlparse(url)
    return parsed.path in ("", "/")
|
||||||
|
|
||||||
def is_404_page(driver) -> bool:
    """Return True if the loaded page carries a "404" marker.

    The check is a plain substring match on the page title and on the
    visible text of the ``<body>`` element, so any occurrence of the
    characters ``404`` (including inside a longer number) is treated as a
    404 indicator.

    Args:
        driver: A WebDriver that has already loaded the page to inspect.

    Returns:
        True when "404" appears in the title or the body text.

    Raises:
        Whatever ``driver.find_element`` raises when the page has no
        ``<body>`` element and the title did not already match.
    """
    # Check the title first: it is already available, and a match makes the
    # extra WebDriver round-trip for the body text unnecessary. No case
    # folding is needed because the marker consists of digits only.
    if "404" in driver.title:
        return True

    body_text = driver.find_element("tag name", "body").text
    return "404" in body_text
||||||
|
|
||||||
def check_url_selenium(url):
    """Load *url* in its own headless Chrome and report whether it is valid.

    A URL is considered valid when all of the following hold:
      - the page loads without a WebDriver error or timeout,
      - the page is not detected as a 404 page,
      - the final URL after redirects is not a root URL (no path).

    Every call creates its own driver and quits it before returning, so no
    driver is shared between the worker threads that call this function.

    Args:
        url: The URL to check.

    Returns:
        True if the URL passed every check, False otherwise. WebDriver
        errors (including a failure to start the browser) are reported and
        turned into False rather than propagated.
    """
    driver = None
    try:
        driver = init_driver()
        driver.get(url)

        # Messages include the URL because checks run in parallel and their
        # output interleaves; tqdm.write keeps the progress bar intact.
        if is_404_page(driver):
            tqdm.write(f"Page returned 404: {url}")
            return False

        # A redirect that lands on the bare domain means the original page
        # is gone even though the load itself succeeded.
        final_url = driver.current_url
        if is_root_url(final_url):
            tqdm.write(f"Redirected to root URL (invalid): {url} -> {final_url}")
            return False

        return True
    except (WebDriverException, TimeoutException) as e:
        tqdm.write(f"Error accessing URL {url}: {e}")
        return False
    finally:
        # driver stays None when init_driver() itself failed.
        if driver:
            driver.quit()
|
||||||
|
|
||||||
def process_event(event):
    """Validate the URL of one event and record the outcome on the event.

    Args:
        event: Dict-like event record. Its ``"Url"`` entry, when present and
            non-empty, is checked with ``check_url_selenium``.

    Returns:
        A ``(url, is_valid)`` tuple. Events without a URL yield
        ``(None, False)`` and are left unmodified; otherwise the result is
        also stored in ``event["url_valid"]``.
    """
    url = event.get("Url")
    if not url:
        # Nothing to check; signal "no URL" with None so callers can tell
        # this apart from a URL that failed validation.
        return None, False

    is_valid = check_url_selenium(url)
    event["url_valid"] = is_valid
    return url, is_valid
||||||
|
|
||||||
def process_jsonl_file(file_path):
|
def process_jsonl_file(file_path, max_workers=4):
|
||||||
total_urls = 0
|
|
||||||
valid_urls = 0
|
|
||||||
invalid_urls = []
|
invalid_urls = []
|
||||||
|
valid_urls = 0
|
||||||
|
|
||||||
driver = init_driver()
|
# Gather all events to process
|
||||||
|
urls_to_check = []
|
||||||
with open(file_path, "r", encoding="utf-8") as f:
|
with open(file_path, "r", encoding="utf-8") as f:
|
||||||
for line in f:
|
for line in f:
|
||||||
line_data = json.loads(line)
|
line_data = json.loads(line)
|
||||||
|
|
||||||
if line_data.get("status") != "success":
|
if line_data.get("status") != "success":
|
||||||
continue
|
continue
|
||||||
|
for event in line_data.get("events", []):
|
||||||
|
urls_to_check.append(event)
|
||||||
|
|
||||||
events = line_data.get("events", [])
|
total_urls = len(urls_to_check)
|
||||||
for event in events:
|
|
||||||
total_urls += 1
|
|
||||||
if process_event(event, driver, invalid_urls):
|
|
||||||
valid_urls += 1
|
|
||||||
|
|
||||||
driver.quit()
|
# ThreadPoolExecutor with tqdm progress bar
|
||||||
|
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
||||||
|
future_to_event = {executor.submit(process_event, e): e for e in urls_to_check}
|
||||||
|
for future in tqdm(as_completed(future_to_event), total=total_urls, desc="Checking URLs"):
|
||||||
|
url, is_valid = future.result()
|
||||||
|
if not is_valid and url:
|
||||||
|
invalid_urls.append(url)
|
||||||
|
else:
|
||||||
|
valid_urls += 1
|
||||||
|
|
||||||
# Summary
|
# Summary
|
||||||
print("\n=== URL Validation Summary ===")
|
print("\n=== URL Validation Summary ===")
|
||||||
@@ -99,6 +105,7 @@ def process_jsonl_file(file_path):
|
|||||||
if __name__ == "__main__":
    # Command-line entry point: validate every event URL in a JSONL file.
    parser = argparse.ArgumentParser(description="Validate URLs in JSONL file events using Selenium")
    parser.add_argument("file_path", type=str, help="Path to the JSONL file")
    parser.add_argument("--workers", type=int, default=4, help="Number of parallel Selenium workers")
    args = parser.parse_args()

    # ThreadPoolExecutor rejects max_workers <= 0 with a ValueError; report
    # that as a normal usage error instead of a traceback.
    if args.workers < 1:
        parser.error("--workers must be at least 1")

    process_jsonl_file(args.file_path, max_workers=args.workers)
|
||||||
Reference in New Issue
Block a user