From 6b5c4a3a1bf05c7d49d32ba95c2363d1c49a302d Mon Sep 17 00:00:00 2001 From: Ching L Date: Wed, 3 Dec 2025 16:07:42 +0800 Subject: [PATCH] refactor(crawler): replace requests with selenium for web scraping - Replaced requests + BeautifulSoup with Selenium WebDriver - Added Chrome WebDriver with headless mode support - Updated HTML element extraction to use Selenium locators - Fixed logger path to use local directory for cross-platform compatibility - Added proper error handling for element extraction - Maintained compatibility with existing Redis and Mastodon functionality --- crawler.py | 187 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 129 insertions(+), 58 deletions(-) diff --git a/crawler.py b/crawler.py index 6899733..f341277 100644 --- a/crawler.py +++ b/crawler.py @@ -2,18 +2,27 @@ import requests -from bs4 import BeautifulSoup import re import redis import json from loguru import logger +from selenium import webdriver +from selenium.webdriver.common.by import By +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.chrome.options import Options +from selenium.common.exceptions import TimeoutException, NoSuchElementException +import time from mastodon import Mastodon -# logging.basicConfig(filename='/root/develop/log/chh-craler.log', level=logging.INFO) -# logger = logging.getLogger('/root/develop/log/chh-craler.log') -logger.add('/root/develop/log/chh-craler.log', level='INFO') +# Configure logger - use local path for macOS +import os +log_dir = './logs' +if not os.path.exists(log_dir): + os.makedirs(log_dir) +logger.add('./logs/chh-crawler.log', level='INFO') # connect to redis with password @@ -45,22 +54,53 @@ def url_shorten(url): else: return url +def setup_driver(): + """Configure and initialize Chrome WebDriver""" + chrome_options = Options() + chrome_options.add_argument('--headless') # Run in headless mode + 
chrome_options.add_argument('--disable-gpu') + chrome_options.add_argument('--no-sandbox') + chrome_options.add_argument('--disable-dev-shm-usage') + chrome_options.add_argument('--disable-blink-features=AutomationControlled') + chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"]) + chrome_options.add_experimental_option('useAutomationExtension', False) + + # Set user agent + user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36' + chrome_options.add_argument(f'user-agent={user_agent}') + chrome_options.add_argument('--window-size=1920,1080') + + driver = webdriver.Chrome(options=chrome_options) + return driver + def crawler(): - # get article list in html div class name = "acon cl" + # Initialize Selenium WebDriver home_url = 'https://www.chiphell.com/' - # a normal chrome user agent - headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 8_6_7) AppleWebKit/534.36 (KHTML, like Gecko) Chrome/51.0.2863.218 Safari/601'} - # get the html page - r = requests.get(home_url, headers=headers) - # use BeautifulSoup to parse the html page - soup = BeautifulSoup(r.text, features="html.parser") - # find the div class name = "acon cl" in the last div name = "chiphell_box cl" - div = soup.find_all('div', class_='chiphell_box cl')[-1] - div = div.find('div', class_='acon cl') - # articles are in the ul div name = "threadulid" - ul = div.find('ul', id='threadulid') - # find all the li tags - li_list = ul.find_all('li') + driver = setup_driver() + + try: + logger.info(f"Starting to crawl {home_url}") + driver.get(home_url) + + # Wait for page to load + time.sleep(3) + + # Find all chiphell_box elements and get the last one + boxes = driver.find_elements(By.CLASS_NAME, 'chiphell_box') + if not boxes: + logger.error("No chiphell_box elements found") + return + + last_box = boxes[-1] + + # Find the acon div within the last box + acon_div = 
last_box.find_element(By.CLASS_NAME, 'acon') + + # Find the ul with id='threadulid' + ul = acon_div.find_element(By.ID, 'threadulid') + + # Find all li elements + li_list = ul.find_elements(By.TAG_NAME, 'li') # a list item is like: #
  • # @@ -87,48 +127,79 @@ def crawler(): # 女王的第一块AP,最早许愿是蓝盘但是到的是白盘,不过白盘也非常美 ... # #
  • - # get the article img, title, author, date, category, content and url - for li in li_list: - # get the article img - img = li.find('img') - img_url = img['src'] - # get the article title - title = li.find('a', class_='tm03 cl') - title = title.text - # get the article author - author = li.find('a', class_='') - author = author.text - # get the article date - date = li.find('span', style='padding-left: 0px;') - date = date.text - # get the article category - category = li.find('a', class_='asort cl') - category = category.text - # get the article content - content = li.find('div', class_='tm04 cl') - content = content.text - # get the article url - url = li.find('a', class_='tm03 cl') - url = home_url + url['href'] - # get the article id - article_id = re.findall(r'article-(\d+)-1.html', url)[0] + # get the article img, title, author, date, category, content and url + for li in li_list: + try: + # get the article img + img = li.find_element(By.TAG_NAME, 'img') + img_url = img.get_attribute('src') + + # get the article title and URL + title_element = li.find_element(By.CLASS_NAME, 'tm03') + title = title_element.text + url = title_element.get_attribute('href') + if not url.startswith('http'): + url = home_url + url + + # get the article id + article_id_match = re.search(r'article-(\d+)-1\.html', url) + article_id = article_id_match.group(1) if article_id_match else None + if not article_id: + continue + + # get the article author + try: + avimain_div = li.find_element(By.CLASS_NAME, 'avimain') + author = avimain_div.find_element(By.TAG_NAME, 'a').text + except NoSuchElementException: + author = 'Unknown' + + # get the article date + try: + avimain2_div = li.find_element(By.CLASS_NAME, 'avimain2') + date_span = avimain2_div.find_element(By.CSS_SELECTOR, 'span[style*="padding-left"]') + date = date_span.text + except NoSuchElementException: + date = 'Unknown' + + # get the article category + try: + category_element = avimain2_div.find_element(By.CLASS_NAME, 'asort') + category = category_element.text + 
except NoSuchElementException: + category = 'Unknown' + + # get the article content + try: + content_element = li.find_element(By.CLASS_NAME, 'tm04') + content = content_element.text + except NoSuchElementException: + content = 'No preview available' + # make the article info a dict + article = { + 'img_url': img_url, + 'title': title, + 'author': author, + 'date': date, + 'category': category, + 'content': content, + 'url': url, + 'article_id': article_id + } - # make the article info a dict - article = { - 'img_url': img_url, - 'title': title, - 'author': author, - 'date': date, - 'category': category, - 'content': content, - 'url': url, - 'article_id': article_id - } - - # save the article info to redis - if save_to_redis(article): - print(article) + # save the article info to redis + if save_to_redis(article): + print(article) + + except Exception as e: + logger.error(f"Error processing article: {e}") + continue + + finally: + # Close the WebDriver + driver.quit() + logger.info("WebDriver closed") def toot(): # get all the keys in redis