refactor(crawler): replace requests with selenium for web scraping

- Replaced requests + BeautifulSoup with Selenium WebDriver
  - Added Chrome WebDriver with headless mode support
  - Updated HTML element extraction to use Selenium locators
  - Fixed logger path to use local directory for cross-platform compatibility
  - Added proper error handling for element extraction
  - Maintained compatibility with existing Redis and Mastodon functionality
This commit is contained in:
Ching L 2025-12-03 16:07:42 +08:00
parent c5bf60858c
commit 6b5c4a3a1b

View File

@ -2,18 +2,27 @@
import requests import requests
from bs4 import BeautifulSoup
import re import re
import redis import redis
import json import json
from loguru import logger from loguru import logger
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import time
from mastodon import Mastodon from mastodon import Mastodon
# Configure logger - use a local ./logs directory so the script works on
# macOS/Linux alike (the old hard-coded /root/develop/log path was Linux-only).
import os
log_dir = './logs'
# exist_ok=True avoids the check-then-create race of os.path.exists + makedirs.
os.makedirs(log_dir, exist_ok=True)
logger.add('./logs/chh-crawler.log', level='INFO')
# connect to redis with password # connect to redis with password
@ -45,22 +54,53 @@ def url_shorten(url):
else: else:
return url return url
def setup_driver():
    """Create and return a Chrome WebDriver configured for headless scraping."""
    opts = Options()
    # Flags commonly needed to run headless Chrome in containers/CI.
    for flag in (
        '--headless',
        '--disable-gpu',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
    ):
        opts.add_argument(flag)
    # Reduce the obvious automation fingerprints Chrome exposes by default.
    opts.add_experimental_option("excludeSwitches", ["enable-automation"])
    opts.add_experimental_option('useAutomationExtension', False)
    # Present a normal desktop browser user agent.
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    opts.add_argument(f'user-agent={user_agent}')
    opts.add_argument('--window-size=1920,1080')
    return webdriver.Chrome(options=opts)
def crawler():
    """Scrape the Chiphell homepage article list and store articles via save_to_redis().

    Drives a headless Chrome browser to the homepage, locates the last
    'chiphell_box' section, and for each <li> article extracts the image URL,
    title, author, date, category, preview text, article URL and numeric id,
    then passes the resulting dict to save_to_redis(). The WebDriver is
    always closed on exit.
    """
    home_url = 'https://www.chiphell.com/'
    driver = setup_driver()

    try:
        logger.info(f"Starting to crawl {home_url}")
        driver.get(home_url)

        # Wait until the article containers are actually present instead of
        # a blind time.sleep(3) — WebDriverWait/EC were imported for this.
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'chiphell_box'))
            )
        except TimeoutException:
            logger.error("Timed out waiting for chiphell_box elements")
            return

        # The article list lives in the last 'chiphell_box' on the page.
        boxes = driver.find_elements(By.CLASS_NAME, 'chiphell_box')
        if not boxes:
            logger.error("No chiphell_box elements found")
            return
        last_box = boxes[-1]

        # Navigate: chiphell_box -> div.acon -> ul#threadulid -> li items.
        acon_div = last_box.find_element(By.CLASS_NAME, 'acon')
        ul = acon_div.find_element(By.ID, 'threadulid')
        li_list = ul.find_elements(By.TAG_NAME, 'li')

        for li in li_list:
            try:
                # Article thumbnail.
                img = li.find_element(By.TAG_NAME, 'img')
                img_url = img.get_attribute('src')

                # Title and URL come from the same anchor element.
                title_element = li.find_element(By.CLASS_NAME, 'tm03')
                title = title_element.text
                url = title_element.get_attribute('href')
                if not url.startswith('http'):
                    url = home_url + url

                # Numeric article id embedded in the URL; skip items without one.
                article_id_match = re.search(r'article-(\d+)-1\.html', url)
                article_id = article_id_match.group(1) if article_id_match else None
                if not article_id:
                    continue

                # Author is optional — fall back to a placeholder.
                # Catch NoSuchElementException specifically instead of a bare
                # except that would also hide real bugs (NameError etc.).
                try:
                    avimain_div = li.find_element(By.CLASS_NAME, 'avimain')
                    author = avimain_div.find_element(By.TAG_NAME, 'a').text
                except NoSuchElementException:
                    author = 'Unknown'

                # Date and category both live under 'avimain2'. Look the
                # container up once, so the category lookup no longer
                # references a variable that is undefined when the date
                # lookup failed (previously a NameError masked by bare except).
                date = 'Unknown'
                category = 'Unknown'
                try:
                    avimain2_div = li.find_element(By.CLASS_NAME, 'avimain2')
                except NoSuchElementException:
                    avimain2_div = None
                if avimain2_div is not None:
                    try:
                        date = avimain2_div.find_element(
                            By.CSS_SELECTOR, 'span[style*="padding-left"]').text
                    except NoSuchElementException:
                        pass
                    try:
                        category = avimain2_div.find_element(
                            By.CLASS_NAME, 'asort').text
                    except NoSuchElementException:
                        pass

                # Preview text is optional.
                try:
                    content = li.find_element(By.CLASS_NAME, 'tm04').text
                except NoSuchElementException:
                    content = 'No preview available'

                # Assemble the article record.
                article = {
                    'img_url': img_url,
                    'title': title,
                    'author': author,
                    'date': date,
                    'category': category,
                    'content': content,
                    'url': url,
                    'article_id': article_id
                }

                # save_to_redis is defined elsewhere in this file; print only
                # when it reports the article was stored.
                if save_to_redis(article):
                    print(article)
            except Exception as e:
                # One broken <li> must not abort the whole crawl.
                logger.error(f"Error processing article: {e}")
                continue
    finally:
        # Always release the browser, even if crawling failed.
        driver.quit()
        logger.info("WebDriver closed")
def toot(): def toot():
# get all the keys in redis # get all the keys in redis