refactor(crawler): use webdriver-manager and add debug logging

This commit is contained in:
root 2025-12-04 11:35:17 +08:00
parent 6b5c4a3a1b
commit 7af1c6acbe

View File

@@ -11,7 +11,9 @@ from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import TimeoutException, NoSuchElementException from selenium.common.exceptions import TimeoutException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
import time import time
from mastodon import Mastodon from mastodon import Mastodon
@@ -70,7 +72,8 @@ def setup_driver():
chrome_options.add_argument(f'user-agent={user_agent}') chrome_options.add_argument(f'user-agent={user_agent}')
chrome_options.add_argument('--window-size=1920,1080') chrome_options.add_argument('--window-size=1920,1080')
driver = webdriver.Chrome(options=chrome_options) service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)
return driver return driver
def crawler(): def crawler():
@@ -81,9 +84,16 @@ def crawler():
try: try:
logger.info(f"Starting to crawl {home_url}") logger.info(f"Starting to crawl {home_url}")
driver.get(home_url) driver.get(home_url)
# Wait for page to load # Wait for page to load
time.sleep(3) time.sleep(3)
# Debug: log page info
logger.info(f"Page title: {driver.title}")
logger.info(f"Current URL: {driver.current_url}")
page_source = driver.page_source
logger.info(f"Page source length: {len(page_source)}")
logger.info(f"Page source preview (first 2000 chars):\n{page_source[:2000]}")
# Find all chiphell_box elements and get the last one # Find all chiphell_box elements and get the last one
boxes = driver.find_elements(By.CLASS_NAME, 'chiphell_box') boxes = driver.find_elements(By.CLASS_NAME, 'chiphell_box')