feat(crawler): 使用 cloudscraper 库进行网页爬取

使用 cloudscraper 库替换 requests 库,以便在网页爬取过程中绕过 Tencent 的防护机制。
This commit is contained in:
Ching L 2025-04-07 17:45:11 +08:00
parent c5bf60858c
commit 29a8a08622

View File

@ -6,6 +6,7 @@ from bs4 import BeautifulSoup
import re
import redis
import json
import cloudscraper
from loguru import logger
from mastodon import Mastodon
@ -25,6 +26,7 @@ mastodon_client = Mastodon(
api_base_url = 'https://nofan.xyz/'
)
scraper = cloudscraper.create_scraper()
def save_to_redis(article):
key = 'chh-article:%s' % article['article_id']
@ -49,9 +51,10 @@ def crawler():
# get article list in html div class name = "acon cl"
home_url = 'https://www.chiphell.com/'
# a normal chrome user agent
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 8_6_7) AppleWebKit/534.36 (KHTML, like Gecko) Chrome/51.0.2863.218 Safari/601'}
# headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 8_6_7) AppleWebKit/534.36 (KHTML, like Gecko) Chrome/51.0.2863.218 Safari/601'}
# get the html page
r = requests.get(home_url, headers=headers)
# r = requests.get(home_url, headers=headers)
r = scraper.get(home_url)
# use BeautifulSoup to parse the html page
soup = BeautifulSoup(r.text, features="html.parser")
# find the div class name = "acon cl" in the last div name = "chiphell_box cl"
@ -128,7 +131,7 @@ def crawler():
# save the article info to redis
if save_to_redis(article):
print(article)
logger.info(article)
def toot():
# get all the keys in redis
@ -142,9 +145,11 @@ def toot():
continue
# download article image to a temp file
img = requests.get(article['img_url'])
# img = requests.get(article['img_url'])
img = scraper.get(article['img_url'], timeout=10)
# upload article image to mastodon
media = mastodon_client.media_post(img.content, 'image/jpeg')
logger.info('Toot %s' % article['title'])
# toot the article info
toot_content = """{title} - #{category} by {author} \n {content} \n\n {url} \n\n #chh #chiphell \n """.format(
title=article['title'],