From 29a8a08622d9e62dd5a78d81d9580c6a6fde77b5 Mon Sep 17 00:00:00 2001 From: Ching L Date: Mon, 7 Apr 2025 17:45:11 +0800 Subject: [PATCH] =?UTF-8?q?feat(crawler):=20=E4=BD=BF=E7=94=A8=20cloudscra?= =?UTF-8?q?per=20=E5=BA=93=E8=BF=9B=E8=A1=8C=E7=BD=91=E9=A1=B5=E7=88=AC?= =?UTF-8?q?=E5=8F=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 使用 cloudscraper 库替换 requests 库,以便在网页爬取过程中绕过 tencent 的防护机制。 --- crawler.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/crawler.py b/crawler.py index 6899733..75914eb 100644 --- a/crawler.py +++ b/crawler.py @@ -6,6 +6,7 @@ from bs4 import BeautifulSoup import re import redis import json +import cloudscraper from loguru import logger from mastodon import Mastodon @@ -25,6 +26,7 @@ mastodon_client = Mastodon( api_base_url = 'https://nofan.xyz/' ) +scraper = cloudscraper.create_scraper() def save_to_redis(article): key = 'chh-article:%s' % article['article_id'] @@ -49,9 +51,10 @@ def crawler(): # get article list in html div class name = "acon cl" home_url = 'https://www.chiphell.com/' # a normal chrome user agent - headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 8_6_7) AppleWebKit/534.36 (KHTML, like Gecko) Chrome/51.0.2863.218 Safari/601'} + # headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 8_6_7) AppleWebKit/534.36 (KHTML, like Gecko) Chrome/51.0.2863.218 Safari/601'} # get the html page - r = requests.get(home_url, headers=headers) + # r = requests.get(home_url, headers=headers) + r = scraper.get(home_url) # use BeautifulSoup to parse the html page soup = BeautifulSoup(r.text, features="html.parser") # find the div class name = "acon cl" in the last div name = "chiphell_box cl" @@ -128,7 +131,7 @@ def crawler(): # save the article info to redis if save_to_redis(article): - print(article) + logger.info(article) def toot(): # get all the keys in redis @@ -142,9 +145,11 @@ def toot(): continue # download article image to a temp file - img = requests.get(article['img_url']) + # img = requests.get(article['img_url']) + img = scraper.get(article['img_url'], timeout=10) # upload article image to mastodon media = mastodon_client.media_post(img.content, 'image/jpeg') + logger.info('Toot %s' % article['title']) # toot the article info toot_content = """{title} - #{category} by {author} \n {content} \n\n {url} \n\n #chh #chiphell \n """.format( title=article['title'],