feat(crawler): 使用 cloudscraper 库进行网页爬取

使用 cloudscraper 库替换 requests 库,以便在网页爬取过程中绕过 Tencent 的防护机制。
This commit is contained in:
Ching L 2025-04-07 17:45:11 +08:00
parent c5bf60858c
commit 29a8a08622

View File

@ -6,6 +6,7 @@ from bs4 import BeautifulSoup
import re
import redis
import json
import cloudscraper
from loguru import logger
from mastodon import Mastodon
@ -25,6 +26,7 @@ mastodon_client = Mastodon(
api_base_url = 'https://nofan.xyz/'
)
scraper = cloudscraper.create_scraper()
def save_to_redis(article):
key = 'chh-article:%s' % article['article_id']
@ -49,9 +51,10 @@ def crawler():
# get article list in html div class name = "acon cl"
home_url = 'https://www.chiphell.com/'
# a normal chrome user agent
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 8_6_7) AppleWebKit/534.36 (KHTML, like Gecko) Chrome/51.0.2863.218 Safari/601'}
# headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 8_6_7) AppleWebKit/534.36 (KHTML, like Gecko) Chrome/51.0.2863.218 Safari/601'}
# get the html page
r = requests.get(home_url, headers=headers)
# r = requests.get(home_url, headers=headers)
r = scraper.get(home_url)
# use BeautifulSoup to parse the html page
soup = BeautifulSoup(r.text, features="html.parser")
# find the div class name = "acon cl" in the last div name = "chiphell_box cl"
@ -128,7 +131,7 @@ def crawler():
# save the article info to redis
if save_to_redis(article):
print(article)
logger.info(article)
def toot():
# get all the keys in redis
@ -142,9 +145,11 @@ def toot():
continue
# download article image to a temp file
img = requests.get(article['img_url'])
# img = requests.get(article['img_url'])
img = scraper.get(article['img_url'], timeout=10)
# upload article image to mastodon
media = mastodon_client.media_post(img.content, 'image/jpeg')
logger.info('Toot %s' % article['title'])
# toot the article info
toot_content = """{title} - #{category} by {author} \n {content} \n\n {url} \n\n #chh #chiphell \n """.format(
title=article['title'],