refactor(crawler): replace requests with selenium for web scraping

- Replaced requests + BeautifulSoup with Selenium WebDriver
- Added Chrome WebDriver with headless mode support
- Updated HTML element extraction to use Selenium locators
- Fixed logger path to use local directory for cross-platform compatibility
- Added proper error handling for element extraction
- Maintained compatibility with existing Redis and Mastodon functionality
2025-12-03 16:07:42 +08:00 · 2025-12-03 16:07:42 +08:00 · 6b5c4a3a1b
commit 6b5c4a3a1b
parent c5bf60858c
1 changed file with 129 additions and 58 deletions
--- a/crawler.py
+++ b/crawler.py
@ -2,18 +2,27 @@
 import requests
 from bs4 import BeautifulSoup
 import re
 import redis
 import json
 from loguru import logger
 from selenium import webdriver
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.chrome.options import Options
 from selenium.common.exceptions import TimeoutException, NoSuchElementException
 import time
 from mastodon import Mastodon
-# logging.basicConfig(filename='/root/develop/log/chh-craler.log', level=logging.INFO)
+# Configure logger - use local path for macOS
-# logger = logging.getLogger('/root/develop/log/chh-craler.log')
+import os
-logger.add('/root/develop/log/chh-craler.log', level='INFO')
+log_dir = './logs'
 # Create the log directory before attaching the file sink. exist_ok=True
 # replaces the separate exists() check, which could race with another
 # process creating the same directory between the check and the mkdir.
 os.makedirs(log_dir, exist_ok=True)
 logger.add('./logs/chh-crawler.log', level='INFO')
 # connect to redis with password
@ -45,22 +54,53 @@ def url_shorten(url):
  else:
    return url
 def setup_driver():
  """Build and return the Chrome WebDriver used by the crawler.

  Returns:
    A ``webdriver.Chrome`` instance created from the options assembled below.
  """
  # Desktop Chrome user-agent string sent with every request.
  ua_string = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
  # Command-line switches, added in this order.
  switches = (
    '--headless',  # Run in headless mode
    '--disable-gpu',
    '--no-sandbox',
    '--disable-dev-shm-usage',
    '--disable-blink-features=AutomationControlled',
    f'user-agent={ua_string}',
    '--window-size=1920,1080',
  )
  opts = Options()
  for switch in switches:
    opts.add_argument(switch)
  # NOTE(review): these two options, together with the AutomationControlled
  # switch above, presumably make the session look less like an automated
  # browser -- confirm against the target site's behaviour.
  opts.add_experimental_option("excludeSwitches", ["enable-automation"])
  opts.add_experimental_option('useAutomationExtension', False)
  return webdriver.Chrome(options=opts)
 def crawler():
-  # get article list in html div class name = "acon cl"
+  # Initialize Selenium WebDriver
  home_url = 'https://www.chiphell.com/'
-  # a normal chrome user agent
+  driver = setup_driver()
-  headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 8_6_7) AppleWebKit/534.36 (KHTML, like Gecko) Chrome/51.0.2863.218 Safari/601'}
+  
-  # get the html page
+  try:
-  r = requests.get(home_url, headers=headers)
+    logger.info(f"Starting to crawl {home_url}")
-  # use BeautifulSoup to parse the html page
+    driver.get(home_url)
-  soup = BeautifulSoup(r.text, features="html.parser")
+    
-  # find the div class name = "acon cl" in the last div name = "chiphell_box cl"
+    # Wait for page to load
-  div = soup.find_all('div', class_='chiphell_box cl')[-1]
+    time.sleep(3)
-  div = div.find('div', class_='acon cl')
+    
-  # articles are in the ul div name = "threadulid"
+    # Find all chiphell_box elements and get the last one
-  ul = div.find('ul', id='threadulid')
+    boxes = driver.find_elements(By.CLASS_NAME, 'chiphell_box')
-  # find all the li tags
+    if not boxes:
-  li_list = ul.find_all('li')
+      logger.error("No chiphell_box elements found")
      return
    last_box = boxes[-1]
    # Find the acon div within the last box
    acon_div = last_box.find_element(By.CLASS_NAME, 'acon')
    # Find the ul with id='threadulid'
    ul = acon_div.find_element(By.ID, 'threadulid')
    # Find all li elements
    li_list = ul.find_elements(By.TAG_NAME, 'li')
  # a list item is like:
  # <li>
  # <a href="article-30010-1.html" target="_blank" class="tm01 cl">
@ -87,48 +127,79 @@ def crawler():
  # 女王的第一块AP，最早许愿是蓝盘但是到的是白盘，不过白盘也非常美  ...</div>
  # </div>
  # </li>
-  # get the article img, title, author, date, category, content and url
+    # get the article img, title, author, date, category, content and url
-  for li in li_list:
+    for li in li_list:
-    # get the article img
+      try:
-    img = li.find('img')
+        # get the article img
-    img_url = img['src']
+        img = li.find_element(By.TAG_NAME, 'img')
-    # get the article title
+        img_url = img.get_attribute('src')
    title = li.find('a', class_='tm03 cl')
    title = title.text
    # get the article author
    author = li.find('a', class_='')
    author = author.text
    # get the article date
    date = li.find('span', style='padding-left: 0px;')
    date = date.text
    # get the article category
    category = li.find('a', class_='asort cl')
    category = category.text
    # get the article content
    content = li.find('div', class_='tm04 cl')
    content = content.text
    # get the article url
    url = li.find('a', class_='tm03 cl')
    url = home_url + url['href']
    # get the article id
    article_id = re.findall(r'article-(\d+)-1.html', url)[0]
        # get the article title and URL
        title_element = li.find_element(By.CLASS_NAME, 'tm03')
        title = title_element.text
        url = title_element.get_attribute('href')
        if not url.startswith('http'):
          url = home_url + url
-    # make the article info a dict
+        # get the article id
-    article = {
+        article_id_match = re.search(r'article-(\d+)-1\.html', url)
-      'img_url': img_url,
+        article_id = article_id_match.group(1) if article_id_match else None
-      'title': title,
+        if not article_id:
-      'author': author,
+          continue
      'date': date,
      'category': category,
      'content': content,
      'url': url,
      'article_id': article_id
    }
-    # save the article info to redis
+        # get the article author
-    if save_to_redis(article):
+        try:
-      print(article)
+          avimain_div = li.find_element(By.CLASS_NAME, 'avimain')
          author = avimain_div.find_element(By.TAG_NAME, 'a').text
        except:
          author = 'Unknown'
        # Look up the metadata container fresh for every <li>. It is reset to
        # None when missing so that a container found for a previous article
        # is never reused for this one.
        try:
          avimain2_div = li.find_element(By.CLASS_NAME, 'avimain2')
        except NoSuchElementException:
          avimain2_div = None
        # get the article date
        date = 'Unknown'
        if avimain2_div is not None:
          try:
            date_span = avimain2_div.find_element(By.CSS_SELECTOR, 'span[style*="padding-left"]')
            date = date_span.text
          except NoSuchElementException:
            # No date span in this item; keep the 'Unknown' placeholder.
            date = 'Unknown'
        # get the article category
        category = 'Unknown'
        if avimain2_div is not None:
          try:
            category_element = avimain2_div.find_element(By.CLASS_NAME, 'asort')
            category = category_element.text
          except NoSuchElementException:
            # No category link in this item; keep the 'Unknown' placeholder.
            category = 'Unknown'
        # get the article content
        # Only a missing element falls back to the placeholder; any other
        # WebDriver failure propagates to the per-article handler, which logs it.
        try:
          content_element = li.find_element(By.CLASS_NAME, 'tm04')
          content = content_element.text
        except NoSuchElementException:
          content = 'No preview available'
        # make the article info a dict
        article = {
          'img_url': img_url,
          'title': title,
          'author': author,
          'date': date,
          'category': category,
          'content': content,
          'url': url,
          'article_id': article_id
        }
        # save the article info to redis
        if save_to_redis(article):
          print(article)
      except Exception as e:
        logger.error(f"Error processing article: {e}")
        continue
  finally:
    # Close the WebDriver
    driver.quit()
    logger.info("WebDriver closed")
 def toot():
  # get all the keys in redis