From 7af1c6acbeae935279df086af0b6730d58732aae Mon Sep 17 00:00:00 2001
From: root
Date: Thu, 4 Dec 2025 11:35:17 +0800
Subject: [PATCH] refactor(crawler): use webdriver-manager and add debug logging

---
 crawler.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/crawler.py b/crawler.py
index f341277..9cafd6d 100644
--- a/crawler.py
+++ b/crawler.py
@@ -11,7 +11,9 @@ from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.chrome.service import Service
 from selenium.common.exceptions import TimeoutException, NoSuchElementException
+from webdriver_manager.chrome import ChromeDriverManager
 import time
 from mastodon import Mastodon
 
@@ -70,7 +72,8 @@ def setup_driver():
     chrome_options.add_argument(f'user-agent={user_agent}')
     chrome_options.add_argument('--window-size=1920,1080')
 
-    driver = webdriver.Chrome(options=chrome_options)
+    service = Service(ChromeDriverManager().install())
+    driver = webdriver.Chrome(service=service, options=chrome_options)
     return driver
 
 def crawler():
@@ -81,9 +84,16 @@ def crawler():
     try:
         logger.info(f"Starting to crawl {home_url}")
         driver.get(home_url)
-        
+
         # Wait for page to load
         time.sleep(3)
+
+        # Debug: log page info
+        logger.info(f"Page title: {driver.title}")
+        logger.info(f"Current URL: {driver.current_url}")
+        page_source = driver.page_source
+        logger.info(f"Page source length: {len(page_source)}")
+        logger.info(f"Page source preview (first 2000 chars):\n{page_source[:2000]}")
 
         # Find all chiphell_box elements and get the last one
         boxes = driver.find_elements(By.CLASS_NAME, 'chiphell_box')