refactor(crawler): replace requests with selenium for web scraping

- Replaced requests + BeautifulSoup with Selenium WebDriver
- Added Chrome WebDriver with headless mode support
- Updated HTML element extraction to use Selenium locators
- Fixed logger path to use a local directory for cross-platform compatibility
- Added proper error handling for element extraction
- Maintained compatibility with existing Redis and Mastodon functionality
This commit is contained in:
Ching L 2025-12-03 16:07:42 +08:00
parent c5bf60858c
commit 6b5c4a3a1b

View File

@ -2,18 +2,27 @@
import requests
from bs4 import BeautifulSoup
import re
import redis
import json
from loguru import logger
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import time
from mastodon import Mastodon
# logging.basicConfig(filename='/root/develop/log/chh-craler.log', level=logging.INFO)
# logger = logging.getLogger('/root/develop/log/chh-craler.log')
logger.add('/root/develop/log/chh-craler.log', level='INFO')
# Configure logger - use local path for macOS
import os
# exist_ok=True avoids the check-then-create race of os.path.exists + makedirs
os.makedirs('./logs', exist_ok=True)
logger.add('./logs/chh-crawler.log', level='INFO')
# connect to redis with password
@ -45,22 +54,53 @@ def url_shorten(url):
else:
return url
def setup_driver():
    """Build and return a headless Chrome WebDriver tuned to look like a normal browser.

    Returns:
        selenium.webdriver.Chrome: a ready-to-use driver instance.
    """
    opts = Options()
    # Headless + container/CI-friendly flags, applied in one pass
    for flag in (
        '--headless',
        '--disable-gpu',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
    ):
        opts.add_argument(flag)
    # Suppress Chrome's "controlled by automated software" hints
    opts.add_experimental_option("excludeSwitches", ["enable-automation"])
    opts.add_experimental_option('useAutomationExtension', False)
    # Present a regular desktop Chrome user agent
    ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    opts.add_argument(f'user-agent={ua}')
    opts.add_argument('--window-size=1920,1080')
    return webdriver.Chrome(options=opts)
def crawler():
# get article list in html div class name = "acon cl"
# Initialize Selenium WebDriver
home_url = 'https://www.chiphell.com/'
# a normal chrome user agent
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 8_6_7) AppleWebKit/534.36 (KHTML, like Gecko) Chrome/51.0.2863.218 Safari/601'}
# get the html page
r = requests.get(home_url, headers=headers)
# use BeautifulSoup to parse the html page
soup = BeautifulSoup(r.text, features="html.parser")
# find the div class name = "acon cl" in the last div name = "chiphell_box cl"
div = soup.find_all('div', class_='chiphell_box cl')[-1]
div = div.find('div', class_='acon cl')
# articles are in the ul div name = "threadulid"
ul = div.find('ul', id='threadulid')
# find all the li tags
li_list = ul.find_all('li')
driver = setup_driver()
try:
logger.info(f"Starting to crawl {home_url}")
driver.get(home_url)
# Wait for page to load
time.sleep(3)
# Find all chiphell_box elements and get the last one
boxes = driver.find_elements(By.CLASS_NAME, 'chiphell_box')
if not boxes:
logger.error("No chiphell_box elements found")
return
last_box = boxes[-1]
# Find the acon div within the last box
acon_div = last_box.find_element(By.CLASS_NAME, 'acon')
# Find the ul with id='threadulid'
ul = acon_div.find_element(By.ID, 'threadulid')
# Find all li elements
li_list = ul.find_elements(By.TAG_NAME, 'li')
# a list item is like:
# <li>
# <a href="article-30010-1.html" target="_blank" class="tm01 cl">
@ -89,30 +129,52 @@ def crawler():
# </li>
# get the article img, title, author, date, category, content and url
for li in li_list:
try:
# get the article img
img = li.find('img')
img_url = img['src']
# get the article title
title = li.find('a', class_='tm03 cl')
title = title.text
# get the article author
author = li.find('a', class_='')
author = author.text
# get the article date
date = li.find('span', style='padding-left: 0px;')
date = date.text
# get the article category
category = li.find('a', class_='asort cl')
category = category.text
# get the article content
content = li.find('div', class_='tm04 cl')
content = content.text
# get the article url
url = li.find('a', class_='tm03 cl')
url = home_url + url['href']
# get the article id
article_id = re.findall(r'article-(\d+)-1.html', url)[0]
img = li.find_element(By.TAG_NAME, 'img')
img_url = img.get_attribute('src')
# get the article title and URL
title_element = li.find_element(By.CLASS_NAME, 'tm03')
title = title_element.text
url = title_element.get_attribute('href')
if not url.startswith('http'):
url = home_url + url
# get the article id
article_id_match = re.search(r'article-(\d+)-1\.html', url)
article_id = article_id_match.group(1) if article_id_match else None
if not article_id:
continue
# get the article author
try:
avimain_div = li.find_element(By.CLASS_NAME, 'avimain')
author = avimain_div.find_element(By.TAG_NAME, 'a').text
except:
author = 'Unknown'
# get the article date
try:
avimain2_div = li.find_element(By.CLASS_NAME, 'avimain2')
date_span = avimain2_div.find_element(By.CSS_SELECTOR, 'span[style*="padding-left"]')
date = date_span.text
except:
date = 'Unknown'
# get the article category
try:
category_element = avimain2_div.find_element(By.CLASS_NAME, 'asort')
category = category_element.text
except:
category = 'Unknown'
# get the article content
try:
content_element = li.find_element(By.CLASS_NAME, 'tm04')
content = content_element.text
except:
content = 'No preview available'
# make the article info a dict
article = {
@ -130,6 +192,15 @@ def crawler():
if save_to_redis(article):
print(article)
except Exception as e:
logger.error(f"Error processing article: {e}")
continue
finally:
# Close the WebDriver
driver.quit()
logger.info("WebDriver closed")
def toot():
# get all the keys in redis
keys = redis_db.keys('chh-article:*')