refactor(crawler): use webdriver-manager and add debug logging
This commit is contained in:
parent
6b5c4a3a1b
commit
7af1c6acbe
14
crawler.py
14
crawler.py
@@ -11,7 +11,9 @@ from selenium.webdriver.common.by import By
|
|||||||
from selenium.webdriver.support.ui import WebDriverWait
|
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
from selenium.webdriver.support import expected_conditions as EC
|
from selenium.webdriver.support import expected_conditions as EC
|
||||||
from selenium.webdriver.chrome.options import Options
|
from selenium.webdriver.chrome.options import Options
|
||||||
|
from selenium.webdriver.chrome.service import Service
|
||||||
from selenium.common.exceptions import TimeoutException, NoSuchElementException
|
from selenium.common.exceptions import TimeoutException, NoSuchElementException
|
||||||
|
from webdriver_manager.chrome import ChromeDriverManager
|
||||||
import time
|
import time
|
||||||
|
|
||||||
from mastodon import Mastodon
|
from mastodon import Mastodon
|
||||||
@@ -70,7 +72,8 @@ def setup_driver():
|
|||||||
chrome_options.add_argument(f'user-agent={user_agent}')
|
chrome_options.add_argument(f'user-agent={user_agent}')
|
||||||
chrome_options.add_argument('--window-size=1920,1080')
|
chrome_options.add_argument('--window-size=1920,1080')
|
||||||
|
|
||||||
driver = webdriver.Chrome(options=chrome_options)
|
service = Service(ChromeDriverManager().install())
|
||||||
|
driver = webdriver.Chrome(service=service, options=chrome_options)
|
||||||
return driver
|
return driver
|
||||||
|
|
||||||
def crawler():
|
def crawler():
|
||||||
@@ -81,9 +84,16 @@ def crawler():
|
|||||||
try:
|
try:
|
||||||
logger.info(f"Starting to crawl {home_url}")
|
logger.info(f"Starting to crawl {home_url}")
|
||||||
driver.get(home_url)
|
driver.get(home_url)
|
||||||
|
|
||||||
# Wait for page to load
|
# Wait for page to load
|
||||||
time.sleep(3)
|
time.sleep(3)
|
||||||
|
|
||||||
|
# Debug: log page info
|
||||||
|
logger.info(f"Page title: {driver.title}")
|
||||||
|
logger.info(f"Current URL: {driver.current_url}")
|
||||||
|
page_source = driver.page_source
|
||||||
|
logger.info(f"Page source length: {len(page_source)}")
|
||||||
|
logger.info(f"Page source preview (first 2000 chars):\n{page_source[:2000]}")
|
||||||
|
|
||||||
# Find all chiphell_box elements and get the last one
|
# Find all chiphell_box elements and get the last one
|
||||||
boxes = driver.find_elements(By.CLASS_NAME, 'chiphell_box')
|
boxes = driver.find_elements(By.CLASS_NAME, 'chiphell_box')
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user