feat(crawler): add cloudscraper to bypass Cloudflare protection
- Replace requests with cloudscraper for image downloading
- Update log file path to use home directory logs
- Add timeout parameter for image requests to prevent hanging
This commit is contained in:
parent
da1969b103
commit
3bbe483c64
@ -7,6 +7,7 @@ import re
|
||||
import redis
|
||||
import json
|
||||
import feedparser
|
||||
import cloudscraper
|
||||
from loguru import logger
|
||||
|
||||
from mastodon import Mastodon
|
||||
@ -14,8 +15,8 @@ from mastodon import Mastodon
|
||||
|
||||
# logging.basicConfig(filename='/root/develop/log/chh-craler.log', level=logging.INFO)
|
||||
# logger = logging.getLogger('/root/develop/log/chh-craler.log')
|
||||
logger.add('/root/develop/log/chh-craler.log', level='INFO')
|
||||
|
||||
logger.add('/home/ching/logs/chh-craler.log', level='INFO')
|
||||
scraper = cloudscraper.create_scraper()
|
||||
|
||||
# connect to redis with password
|
||||
redis_db = redis.StrictRedis(host="localhost",
|
||||
@ -105,7 +106,8 @@ def toot():
|
||||
continue
|
||||
|
||||
# download article image to a temp file
|
||||
img = requests.get(article['img_url'])
|
||||
#img = requests.get(article['img_url'])
|
||||
img = scraper.get(article['img_url'], timeout=10)
|
||||
# upload article image to mastodon
|
||||
media = mastodon_client.media_post(img.content, 'image/jpeg')
|
||||
# toot the article info
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user