From 3bbe483c646369caaa24d2a614ef83a716b980af Mon Sep 17 00:00:00 2001 From: Ching L Date: Fri, 5 Dec 2025 21:07:40 +0800 Subject: [PATCH] feat(crawler): add cloudscraper to bypass Cloudflare protection - Replace requests with cloudscraper for image downloading - Update log file path to use home directory logs - Add timeout parameter for image requests to prevent hanging --- crawler.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/crawler.py b/crawler.py index 645074a..894c8fd 100644 --- a/crawler.py +++ b/crawler.py @@ -7,6 +7,7 @@ import re import redis import json import feedparser +import cloudscraper from loguru import logger from mastodon import Mastodon @@ -14,8 +15,8 @@ from mastodon import Mastodon # logging.basicConfig(filename='/root/develop/log/chh-craler.log', level=logging.INFO) # logger = logging.getLogger('/root/develop/log/chh-craler.log') -logger.add('/root/develop/log/chh-craler.log', level='INFO') - +logger.add('/home/ching/logs/chh-craler.log', level='INFO') +scraper = cloudscraper.create_scraper() # connect to redis with password redis_db = redis.StrictRedis(host="localhost", @@ -105,7 +106,8 @@ def toot(): continue # download article image to a temp file - img = requests.get(article['img_url']) + #img = requests.get(article['img_url']) + img = scraper.get(article['img_url'], timeout=10) # upload article image to mastodon media = mastodon_client.media_post(img.content, 'image/jpeg') # toot the article info