From 6b5c4a3a1bf05c7d49d32ba95c2363d1c49a302d Mon Sep 17 00:00:00 2001 From: Ching L Date: Wed, 3 Dec 2025 16:07:42 +0800 Subject: [PATCH] refactor(crawler): replace requests with selenium for web scraping - Replaced requests + BeautifulSoup with Selenium WebDriver - Added Chrome WebDriver with headless mode support - Updated HTML element extraction to use Selenium locators - Fixed logger path to use local directory for cross-platform compatibility - Added proper error handling for element extraction - Maintained compatibility with existing Redis and Mastodon functionality --- crawler.py | 187 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 129 insertions(+), 58 deletions(-) diff --git a/crawler.py b/crawler.py index 6899733..f341277 100644 --- a/crawler.py +++ b/crawler.py @@ -2,18 +2,27 @@ import requests -from bs4 import BeautifulSoup import re import redis import json from loguru import logger +from selenium import webdriver +from selenium.webdriver.common.by import By +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.chrome.options import Options +from selenium.common.exceptions import TimeoutException, NoSuchElementException +import time from mastodon import Mastodon -# logging.basicConfig(filename='/root/develop/log/chh-craler.log', level=logging.INFO) -# logger = logging.getLogger('/root/develop/log/chh-craler.log') -logger.add('/root/develop/log/chh-craler.log', level='INFO') +# Configure logger - use local path for macOS +import os +log_dir = './logs' +if not os.path.exists(log_dir): + os.makedirs(log_dir) +logger.add('./logs/chh-crawler.log', level='INFO') # connect to redis with password @@ -45,22 +54,53 @@ def url_shorten(url): else: return url +def setup_driver(): + """Configure and initialize Chrome WebDriver""" + chrome_options = Options() + chrome_options.add_argument('--headless') # Run in headless mode + 
chrome_options.add_argument('--disable-gpu') + chrome_options.add_argument('--no-sandbox') + chrome_options.add_argument('--disable-dev-shm-usage') + chrome_options.add_argument('--disable-blink-features=AutomationControlled') + chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"]) + chrome_options.add_experimental_option('useAutomationExtension', False) + + # Set user agent + user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36' + chrome_options.add_argument(f'user-agent={user_agent}') + chrome_options.add_argument('--window-size=1920,1080') + + driver = webdriver.Chrome(options=chrome_options) + return driver + def crawler(): - # get article list in html div class name = "acon cl" + # Initialize Selenium WebDriver home_url = 'https://www.chiphell.com/' - # a normal chrome user agent - headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 8_6_7) AppleWebKit/534.36 (KHTML, like Gecko) Chrome/51.0.2863.218 Safari/601'} - # get the html page - r = requests.get(home_url, headers=headers) - # use BeautifulSoup to parse the html page - soup = BeautifulSoup(r.text, features="html.parser") - # find the div class name = "acon cl" in the last div name = "chiphell_box cl" - div = soup.find_all('div', class_='chiphell_box cl')[-1] - div = div.find('div', class_='acon cl') - # articles are in the ul div name = "threadulid" - ul = div.find('ul', id='threadulid') - # find all the li tags - li_list = ul.find_all('li') + driver = setup_driver() + + try: + logger.info(f"Starting to crawl {home_url}") + driver.get(home_url) + + # Wait for page to load + time.sleep(3) + + # Find all chiphell_box elements and get the last one + boxes = driver.find_elements(By.CLASS_NAME, 'chiphell_box') + if not boxes: + logger.error("No chiphell_box elements found") + return + + last_box = boxes[-1] + + # Find the acon div within the last box + acon_div = 
last_box.find_element(By.CLASS_NAME, 'acon') + + # Find the ul with id='threadulid' + ul = acon_div.find_element(By.ID, 'threadulid') + + # Find all li elements + li_list = ul.find_elements(By.TAG_NAME, 'li') # a list item is like: #
  • # @@ -87,48 +127,79 @@ def crawler(): # 女王的第一块AP,最早许愿是蓝盘但是到的是白盘,不过白盘也非常美 ... # #
  • - # get the article img, title, author, date, category, content and url - for li in li_list: - # get the article img - img = li.find('img') - img_url = img['src'] - # get the article title - title = li.find('a', class_='tm03 cl') - title = title.text - # get the article author - author = li.find('a', class_='') - author = author.text - # get the article date - date = li.find('span', style='padding-left: 0px;') - date = date.text - # get the article category - category = li.find('a', class_='asort cl') - category = category.text - # get the article content - content = li.find('div', class_='tm04 cl') - content = content.text - # get the article url - url = li.find('a', class_='tm03 cl') - url = home_url + url['href'] - # get the article id - article_id = re.findall(r'article-(\d+)-1.html', url)[0] + # get the article img, title, author, date, category, content and url + for li in li_list: + try: + # get the article img + img = li.find_element(By.TAG_NAME, 'img') + img_url = img.get_attribute('src') + + # get the article title and URL + title_element = li.find_element(By.CLASS_NAME, 'tm03') + title = title_element.text + url = title_element.get_attribute('href') + if not url.startswith('http'): + url = home_url + url + + # get the article id + article_id_match = re.search(r'article-(\d+)-1\.html', url) + article_id = article_id_match.group(1) if article_id_match else None + if not article_id: + continue + + # get the article author + try: + avimain_div = li.find_element(By.CLASS_NAME, 'avimain') + author = avimain_div.find_element(By.TAG_NAME, 'a').text + except NoSuchElementException: + author = 'Unknown' + + # get the article date + try: + avimain2_div = li.find_element(By.CLASS_NAME, 'avimain2') + date_span = avimain2_div.find_element(By.CSS_SELECTOR, 'span[style*="padding-left"]') + date = date_span.text + except NoSuchElementException: + date = 'Unknown' + + # get the article category + try: + category_element = avimain2_div.find_element(By.CLASS_NAME, 'asort') + category = category_element.text + 
except NoSuchElementException: + category = 'Unknown' + + # get the article content + try: + content_element = li.find_element(By.CLASS_NAME, 'tm04') + content = content_element.text + except NoSuchElementException: + content = 'No preview available' + # make the article info a dict + article = { + 'img_url': img_url, + 'title': title, + 'author': author, + 'date': date, + 'category': category, + 'content': content, + 'url': url, + 'article_id': article_id + } - # make the article info a dict - article = { - 'img_url': img_url, - 'title': title, - 'author': author, - 'date': date, - 'category': category, - 'content': content, - 'url': url, - 'article_id': article_id - } - - # save the article info to redis - if save_to_redis(article): - print(article) + # save the article info to redis + if save_to_redis(article): + print(article) + + except Exception as e: + logger.error(f"Error processing article: {e}") + continue + + finally: + # Close the WebDriver + driver.quit() + logger.info("WebDriver closed") def toot(): # get all the keys in redis