feat(crawler): 使用 cloudscraper 库进行网页爬取

使用 cloudscraper 库替换 requests 库,以便在网页爬取过程中绕过 Tencent 的防护机制。
This commit is contained in:
Ching L 2025-04-07 17:45:11 +08:00
parent c5bf60858c
commit 29a8a08622

View File

@ -6,6 +6,7 @@ from bs4 import BeautifulSoup
import re import re
import redis import redis
import json import json
import cloudscraper
from loguru import logger from loguru import logger
from mastodon import Mastodon from mastodon import Mastodon
@ -25,6 +26,7 @@ mastodon_client = Mastodon(
api_base_url = 'https://nofan.xyz/' api_base_url = 'https://nofan.xyz/'
) )
scraper = cloudscraper.create_scraper()
def save_to_redis(article): def save_to_redis(article):
key = 'chh-article:%s' % article['article_id'] key = 'chh-article:%s' % article['article_id']
@ -49,9 +51,10 @@ def crawler():
# get article list in html div class name = "acon cl" # get article list in html div class name = "acon cl"
home_url = 'https://www.chiphell.com/' home_url = 'https://www.chiphell.com/'
# a normal chrome user agent # a normal chrome user agent
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 8_6_7) AppleWebKit/534.36 (KHTML, like Gecko) Chrome/51.0.2863.218 Safari/601'} # headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 8_6_7) AppleWebKit/534.36 (KHTML, like Gecko) Chrome/51.0.2863.218 Safari/601'}
# get the html page # get the html page
r = requests.get(home_url, headers=headers) # r = requests.get(home_url, headers=headers)
r = scraper.get(home_url)
# use BeautifulSoup to parse the html page # use BeautifulSoup to parse the html page
soup = BeautifulSoup(r.text, features="html.parser") soup = BeautifulSoup(r.text, features="html.parser")
# find the div class name = "acon cl" in the last div name = "chiphell_box cl" # find the div class name = "acon cl" in the last div name = "chiphell_box cl"
@ -128,7 +131,7 @@ def crawler():
# save the article info to redis # save the article info to redis
if save_to_redis(article): if save_to_redis(article):
print(article) logger.info(article)
def toot(): def toot():
# get all the keys in redis # get all the keys in redis
@ -142,9 +145,11 @@ def toot():
continue continue
# download article image to a temp file # download article image to a temp file
img = requests.get(article['img_url']) # img = requests.get(article['img_url'])
img = scraper.get(article['img_url'], timeout=10)
# upload article image to mastodon # upload article image to mastodon
media = mastodon_client.media_post(img.content, 'image/jpeg') media = mastodon_client.media_post(img.content, 'image/jpeg')
logger.info('Toot %s' % article['title'])
# toot the article info # toot the article info
toot_content = """{title} - #{category} by {author} \n {content} \n\n {url} \n\n #chh #chiphell \n """.format( toot_content = """{title} - #{category} by {author} \n {content} \n\n {url} \n\n #chh #chiphell \n """.format(
title=article['title'], title=article['title'],