From 29a8a08622d9e62dd5a78d81d9580c6a6fde77b5 Mon Sep 17 00:00:00 2001
From: Ching L <loooching@gmail.com>
Date: Mon, 7 Apr 2025 17:45:11 +0800
Subject: [PATCH] =?UTF-8?q?feat(crawler):=20=E4=BD=BF=E7=94=A8=20cloudscra?=
 =?UTF-8?q?per=20=E5=BA=93=E8=BF=9B=E8=A1=8C=E7=BD=91=E9=A1=B5=E7=88=AC?=
 =?UTF-8?q?=E5=8F=96?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

使用 cloudscraper 库替换 requests 库，以便在网页爬取过程中绕过 Tencent 的防护机制。同时将文章输出由 print 改为 logger.info，为图片下载请求增加 10 秒超时，并在发嘟文前记录文章标题日志。
---
 crawler.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/crawler.py b/crawler.py
index 6899733..75914eb 100644
--- a/crawler.py
+++ b/crawler.py
@@ -6,6 +6,7 @@ from bs4 import BeautifulSoup
 import re
 import redis
 import json
+import cloudscraper
 from loguru import logger
 
 from mastodon import Mastodon
@@ -25,6 +26,7 @@ mastodon_client = Mastodon(
   api_base_url = 'https://nofan.xyz/'
 )
 
+scraper = cloudscraper.create_scraper()
 
 def save_to_redis(article):
   key = 'chh-article:%s' % article['article_id']
@@ -49,9 +51,10 @@ def crawler():
   # get article list in html div class name = "acon cl"
   home_url = 'https://www.chiphell.com/'
   # a normal chrome user agent
-  headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 8_6_7) AppleWebKit/534.36 (KHTML, like Gecko) Chrome/51.0.2863.218 Safari/601'}
+  # headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 8_6_7) AppleWebKit/534.36 (KHTML, like Gecko) Chrome/51.0.2863.218 Safari/601'}
   # get the html page
-  r = requests.get(home_url, headers=headers)
+  # r = requests.get(home_url, headers=headers)
+  r = scraper.get(home_url)
   # use BeautifulSoup to parse the html page
   soup = BeautifulSoup(r.text, features="html.parser")
   # find the div class name = "acon cl" in the last div name = "chiphell_box cl"
@@ -128,7 +131,7 @@ def crawler():
 
     # save the article info to redis
     if save_to_redis(article):
-      print(article)
+      logger.info(article)
 
 def toot():
   # get all the keys in redis
@@ -142,9 +145,11 @@ def toot():
       continue
 
     # download article image to a temp file
-    img = requests.get(article['img_url'])
+    # img = requests.get(article['img_url'])
+    img = scraper.get(article['img_url'], timeout=10)
     # upload article image to mastodon
     media = mastodon_client.media_post(img.content, 'image/jpeg')
+    logger.info('Toot %s' % article['title'])
     # toot the article info
     toot_content = """{title} - #{category} by {author} \n {content} \n\n {url} \n\n #chh #chiphell \n """.format(
       title=article['title'],