refactor(crawler): use webdriver-manager and add debug logging
This commit is contained in:
parent
6b5c4a3a1b
commit
7af1c6acbe
14
crawler.py
14
crawler.py
@@ -11,7 +11,9 @@ from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.webdriver.chrome.service import Service
|
||||
from selenium.common.exceptions import TimeoutException, NoSuchElementException
|
||||
from webdriver_manager.chrome import ChromeDriverManager
|
||||
import time
|
||||
|
||||
from mastodon import Mastodon
|
||||
@@ -70,7 +72,8 @@ def setup_driver():
|
||||
chrome_options.add_argument(f'user-agent={user_agent}')
|
||||
chrome_options.add_argument('--window-size=1920,1080')
|
||||
|
||||
- driver = webdriver.Chrome(options=chrome_options)
|
||||
+ service = Service(ChromeDriverManager().install())
|
||||
+ driver = webdriver.Chrome(service=service, options=chrome_options)
|
||||
return driver
|
||||
|
||||
def crawler():
|
||||
@@ -81,9 +84,16 @@ def crawler():
|
||||
try:
|
||||
logger.info(f"Starting to crawl {home_url}")
|
||||
driver.get(home_url)
|
||||
|
||||
|
||||
# Wait for page to load
|
||||
time.sleep(3)
|
||||
|
||||
# Debug: log page info
|
||||
logger.info(f"Page title: {driver.title}")
|
||||
logger.info(f"Current URL: {driver.current_url}")
|
||||
page_source = driver.page_source
|
||||
logger.info(f"Page source length: {len(page_source)}")
|
||||
logger.info(f"Page source preview (first 2000 chars):\n{page_source[:2000]}")
|
||||
|
||||
# Find all chiphell_box elements and get the last one
|
||||
boxes = driver.find_elements(By.CLASS_NAME, 'chiphell_box')
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user