refactor(crawler): replace requests with selenium for web scraping
- Replaced requests + BeautifulSoup with Selenium WebDriver
- Added Chrome WebDriver with headless mode support
- Updated HTML element extraction to use Selenium locators
- Fixed logger path to use local directory for cross-platform compatibility
- Added proper error handling for element extraction
- Maintained compatibility with existing Redis and Mastodon functionality
This commit is contained in:
parent
c5bf60858c
commit
6b5c4a3a1b
185
crawler.py
185
crawler.py
@@ -2,18 +2,27 @@
|
|||||||
|
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
import re
|
import re
|
||||||
import redis
|
import redis
|
||||||
import json
|
import json
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
|
from selenium.webdriver.support import expected_conditions as EC
|
||||||
|
from selenium.webdriver.chrome.options import Options
|
||||||
|
from selenium.common.exceptions import TimeoutException, NoSuchElementException
|
||||||
|
import time
|
||||||
|
|
||||||
from mastodon import Mastodon
|
from mastodon import Mastodon
|
||||||
|
|
||||||
|
|
||||||
# logging.basicConfig(filename='/root/develop/log/chh-craler.log', level=logging.INFO)
|
# Configure logger - use local path for macOS
|
||||||
# logger = logging.getLogger('/root/develop/log/chh-craler.log')
|
import os
|
||||||
logger.add('/root/develop/log/chh-craler.log', level='INFO')
|
log_dir = './logs'
|
||||||
|
if not os.path.exists(log_dir):
|
||||||
|
os.makedirs(log_dir)
|
||||||
|
logger.add('./logs/chh-crawler.log', level='INFO')
|
||||||
|
|
||||||
|
|
||||||
# connect to redis with password
|
# connect to redis with password
|
||||||
@@ -45,22 +54,53 @@ def url_shorten(url):
|
|||||||
else:
|
else:
|
||||||
return url
|
return url
|
||||||
|
|
||||||
|
def setup_driver():
|
||||||
|
"""Configure and initialize Chrome WebDriver"""
|
||||||
|
chrome_options = Options()
|
||||||
|
chrome_options.add_argument('--headless') # Run in headless mode
|
||||||
|
chrome_options.add_argument('--disable-gpu')
|
||||||
|
chrome_options.add_argument('--no-sandbox')
|
||||||
|
chrome_options.add_argument('--disable-dev-shm-usage')
|
||||||
|
chrome_options.add_argument('--disable-blink-features=AutomationControlled')
|
||||||
|
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
|
||||||
|
chrome_options.add_experimental_option('useAutomationExtension', False)
|
||||||
|
|
||||||
|
# Set user agent
|
||||||
|
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
|
||||||
|
chrome_options.add_argument(f'user-agent={user_agent}')
|
||||||
|
chrome_options.add_argument('--window-size=1920,1080')
|
||||||
|
|
||||||
|
driver = webdriver.Chrome(options=chrome_options)
|
||||||
|
return driver
|
||||||
|
|
||||||
def crawler():
|
def crawler():
|
||||||
# get article list in html div class name = "acon cl"
|
# Initialize Selenium WebDriver
|
||||||
home_url = 'https://www.chiphell.com/'
|
home_url = 'https://www.chiphell.com/'
|
||||||
# a normal chrome user agent
|
driver = setup_driver()
|
||||||
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 8_6_7) AppleWebKit/534.36 (KHTML, like Gecko) Chrome/51.0.2863.218 Safari/601'}
|
|
||||||
# get the html page
|
try:
|
||||||
r = requests.get(home_url, headers=headers)
|
logger.info(f"Starting to crawl {home_url}")
|
||||||
# use BeautifulSoup to parse the html page
|
driver.get(home_url)
|
||||||
soup = BeautifulSoup(r.text, features="html.parser")
|
|
||||||
# find the div class name = "acon cl" in the last div name = "chiphell_box cl"
|
# Wait for page to load
|
||||||
div = soup.find_all('div', class_='chiphell_box cl')[-1]
|
time.sleep(3)
|
||||||
div = div.find('div', class_='acon cl')
|
|
||||||
# articles are in the ul div name = "threadulid"
|
# Find all chiphell_box elements and get the last one
|
||||||
ul = div.find('ul', id='threadulid')
|
boxes = driver.find_elements(By.CLASS_NAME, 'chiphell_box')
|
||||||
# find all the li tags
|
if not boxes:
|
||||||
li_list = ul.find_all('li')
|
logger.error("No chiphell_box elements found")
|
||||||
|
return
|
||||||
|
|
||||||
|
last_box = boxes[-1]
|
||||||
|
|
||||||
|
# Find the acon div within the last box
|
||||||
|
acon_div = last_box.find_element(By.CLASS_NAME, 'acon')
|
||||||
|
|
||||||
|
# Find the ul with id='threadulid'
|
||||||
|
ul = acon_div.find_element(By.ID, 'threadulid')
|
||||||
|
|
||||||
|
# Find all li elements
|
||||||
|
li_list = ul.find_elements(By.TAG_NAME, 'li')
|
||||||
# a list item is like:
|
# a list item is like:
|
||||||
# <li>
|
# <li>
|
||||||
# <a href="article-30010-1.html" target="_blank" class="tm01 cl">
|
# <a href="article-30010-1.html" target="_blank" class="tm01 cl">
|
||||||
@@ -87,48 +127,79 @@ def crawler():
|
|||||||
# 女王的第一块AP,最早许愿是蓝盘但是到的是白盘,不过白盘也非常美 ...</div>
|
# 女王的第一块AP,最早许愿是蓝盘但是到的是白盘,不过白盘也非常美 ...</div>
|
||||||
# </div>
|
# </div>
|
||||||
# </li>
|
# </li>
|
||||||
# get the article img, title, author, date, category, content and url
|
# get the article img, title, author, date, category, content and url
|
||||||
for li in li_list:
|
for li in li_list:
|
||||||
# get the article img
|
try:
|
||||||
img = li.find('img')
|
# get the article img
|
||||||
img_url = img['src']
|
img = li.find_element(By.TAG_NAME, 'img')
|
||||||
# get the article title
|
img_url = img.get_attribute('src')
|
||||||
title = li.find('a', class_='tm03 cl')
|
|
||||||
title = title.text
|
|
||||||
# get the article author
|
|
||||||
author = li.find('a', class_='')
|
|
||||||
author = author.text
|
|
||||||
# get the article date
|
|
||||||
date = li.find('span', style='padding-left: 0px;')
|
|
||||||
date = date.text
|
|
||||||
# get the article category
|
|
||||||
category = li.find('a', class_='asort cl')
|
|
||||||
category = category.text
|
|
||||||
# get the article content
|
|
||||||
content = li.find('div', class_='tm04 cl')
|
|
||||||
content = content.text
|
|
||||||
# get the article url
|
|
||||||
url = li.find('a', class_='tm03 cl')
|
|
||||||
url = home_url + url['href']
|
|
||||||
# get the article id
|
|
||||||
article_id = re.findall(r'article-(\d+)-1.html', url)[0]
|
|
||||||
|
|
||||||
|
# get the article title and URL
|
||||||
|
title_element = li.find_element(By.CLASS_NAME, 'tm03')
|
||||||
|
title = title_element.text
|
||||||
|
url = title_element.get_attribute('href')
|
||||||
|
if not url.startswith('http'):
|
||||||
|
url = home_url + url
|
||||||
|
|
||||||
# make the article info a dict
|
# get the article id
|
||||||
article = {
|
article_id_match = re.search(r'article-(\d+)-1\.html', url)
|
||||||
'img_url': img_url,
|
article_id = article_id_match.group(1) if article_id_match else None
|
||||||
'title': title,
|
if not article_id:
|
||||||
'author': author,
|
continue
|
||||||
'date': date,
|
|
||||||
'category': category,
|
|
||||||
'content': content,
|
|
||||||
'url': url,
|
|
||||||
'article_id': article_id
|
|
||||||
}
|
|
||||||
|
|
||||||
# save the article info to redis
|
# get the article author
|
||||||
if save_to_redis(article):
|
try:
|
||||||
print(article)
|
avimain_div = li.find_element(By.CLASS_NAME, 'avimain')
|
||||||
|
author = avimain_div.find_element(By.TAG_NAME, 'a').text
|
||||||
|
except:
|
||||||
|
author = 'Unknown'
|
||||||
|
|
||||||
|
# get the article date
|
||||||
|
try:
|
||||||
|
avimain2_div = li.find_element(By.CLASS_NAME, 'avimain2')
|
||||||
|
date_span = avimain2_div.find_element(By.CSS_SELECTOR, 'span[style*="padding-left"]')
|
||||||
|
date = date_span.text
|
||||||
|
except:
|
||||||
|
date = 'Unknown'
|
||||||
|
|
||||||
|
# get the article category
|
||||||
|
try:
|
||||||
|
category_element = avimain2_div.find_element(By.CLASS_NAME, 'asort')
|
||||||
|
category = category_element.text
|
||||||
|
except:
|
||||||
|
category = 'Unknown'
|
||||||
|
|
||||||
|
# get the article content
|
||||||
|
try:
|
||||||
|
content_element = li.find_element(By.CLASS_NAME, 'tm04')
|
||||||
|
content = content_element.text
|
||||||
|
except:
|
||||||
|
content = 'No preview available'
|
||||||
|
|
||||||
|
# make the article info a dict
|
||||||
|
article = {
|
||||||
|
'img_url': img_url,
|
||||||
|
'title': title,
|
||||||
|
'author': author,
|
||||||
|
'date': date,
|
||||||
|
'category': category,
|
||||||
|
'content': content,
|
||||||
|
'url': url,
|
||||||
|
'article_id': article_id
|
||||||
|
}
|
||||||
|
|
||||||
|
# save the article info to redis
|
||||||
|
if save_to_redis(article):
|
||||||
|
print(article)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error processing article: {e}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
finally:
|
||||||
|
# Close the WebDriver
|
||||||
|
driver.quit()
|
||||||
|
logger.info("WebDriver closed")
|
||||||
|
|
||||||
def toot():
|
def toot():
|
||||||
# get all the keys in redis
|
# get all the keys in redis
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user