refactor(crawler): replace requests with selenium for web scraping

- Replaced requests + BeautifulSoup with Selenium WebDriver
- Added Chrome WebDriver with headless mode support
- Updated HTML element extraction to use Selenium locators
- Fixed logger path to use a local directory for cross-platform compatibility
- Added proper error handling for element extraction
- Maintained compatibility with existing Redis and Mastodon functionality
This commit is contained in:
Ching L 2025-12-03 16:07:42 +08:00
parent c5bf60858c
commit 6b5c4a3a1b

View File

@ -2,18 +2,27 @@
import requests
from bs4 import BeautifulSoup
import re
import redis
import json
from loguru import logger
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import time
from mastodon import Mastodon
# logging.basicConfig(filename='/root/develop/log/chh-craler.log', level=logging.INFO)
# logger = logging.getLogger('/root/develop/log/chh-craler.log')
logger.add('/root/develop/log/chh-craler.log', level='INFO')
# Configure logger - use local path for macOS
import os
# exist_ok=True avoids the check-then-create race of os.path.exists + makedirs
os.makedirs('./logs', exist_ok=True)
logger.add('./logs/chh-crawler.log', level='INFO')
# connect to redis with password
@ -45,22 +54,53 @@ def url_shorten(url):
else:
return url
def setup_driver():
    """Build and return a headless Chrome WebDriver tuned to look like a normal browser.

    Returns:
        selenium.webdriver.Chrome: a ready-to-use driver instance.
    """
    opts = Options()
    # Headless + container/CI-friendly flags, applied in one pass
    for flag in (
        '--headless',
        '--disable-gpu',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
    ):
        opts.add_argument(flag)
    # Suppress Chrome's "controlled by automated software" hints
    opts.add_experimental_option("excludeSwitches", ["enable-automation"])
    opts.add_experimental_option('useAutomationExtension', False)
    # Present a regular desktop Chrome user agent
    ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    opts.add_argument(f'user-agent={ua}')
    opts.add_argument('--window-size=1920,1080')
    return webdriver.Chrome(options=opts)
def crawler():
# get article list in html div class name = "acon cl"
# Initialize Selenium WebDriver
home_url = 'https://www.chiphell.com/'
# a normal chrome user agent
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 8_6_7) AppleWebKit/534.36 (KHTML, like Gecko) Chrome/51.0.2863.218 Safari/601'}
# get the html page
r = requests.get(home_url, headers=headers)
# use BeautifulSoup to parse the html page
soup = BeautifulSoup(r.text, features="html.parser")
# find the div class name = "acon cl" in the last div name = "chiphell_box cl"
div = soup.find_all('div', class_='chiphell_box cl')[-1]
div = div.find('div', class_='acon cl')
# articles are in the ul div name = "threadulid"
ul = div.find('ul', id='threadulid')
# find all the li tags
li_list = ul.find_all('li')
driver = setup_driver()
try:
logger.info(f"Starting to crawl {home_url}")
driver.get(home_url)
# Wait for page to load
time.sleep(3)
# Find all chiphell_box elements and get the last one
boxes = driver.find_elements(By.CLASS_NAME, 'chiphell_box')
if not boxes:
logger.error("No chiphell_box elements found")
return
last_box = boxes[-1]
# Find the acon div within the last box
acon_div = last_box.find_element(By.CLASS_NAME, 'acon')
# Find the ul with id='threadulid'
ul = acon_div.find_element(By.ID, 'threadulid')
# Find all li elements
li_list = ul.find_elements(By.TAG_NAME, 'li')
# a list item is like:
# <li>
# <a href="article-30010-1.html" target="_blank" class="tm01 cl">
@ -89,30 +129,52 @@ def crawler():
# </li>
# get the article img, title, author, date, category, content and url
for li in li_list:
try:
# get the article img
img = li.find('img')
img_url = img['src']
# get the article title
title = li.find('a', class_='tm03 cl')
title = title.text
# get the article author
author = li.find('a', class_='')
author = author.text
# get the article date
date = li.find('span', style='padding-left: 0px;')
date = date.text
# get the article category
category = li.find('a', class_='asort cl')
category = category.text
# get the article content
content = li.find('div', class_='tm04 cl')
content = content.text
# get the article url
url = li.find('a', class_='tm03 cl')
url = home_url + url['href']
# get the article id
article_id = re.findall(r'article-(\d+)-1.html', url)[0]
img = li.find_element(By.TAG_NAME, 'img')
img_url = img.get_attribute('src')
# get the article title and URL
title_element = li.find_element(By.CLASS_NAME, 'tm03')
title = title_element.text
url = title_element.get_attribute('href')
if not url.startswith('http'):
url = home_url + url
# get the article id
article_id_match = re.search(r'article-(\d+)-1\.html', url)
article_id = article_id_match.group(1) if article_id_match else None
if not article_id:
continue
# get the article author
try:
avimain_div = li.find_element(By.CLASS_NAME, 'avimain')
author = avimain_div.find_element(By.TAG_NAME, 'a').text
except:
author = 'Unknown'
# get the article date
try:
avimain2_div = li.find_element(By.CLASS_NAME, 'avimain2')
date_span = avimain2_div.find_element(By.CSS_SELECTOR, 'span[style*="padding-left"]')
date = date_span.text
except:
date = 'Unknown'
# get the article category
try:
category_element = avimain2_div.find_element(By.CLASS_NAME, 'asort')
category = category_element.text
except:
category = 'Unknown'
# get the article content
try:
content_element = li.find_element(By.CLASS_NAME, 'tm04')
content = content_element.text
except:
content = 'No preview available'
# make the article info a dict
article = {
@ -130,6 +192,15 @@ def crawler():
if save_to_redis(article):
print(article)
except Exception as e:
logger.error(f"Error processing article: {e}")
continue
finally:
# Close the WebDriver
driver.quit()
logger.info("WebDriver closed")
def toot():
# get all the keys in redis
keys = redis_db.keys('chh-article:*')