feat(crawler): add cloudscraper to bypass Cloudflare protection

- Replace requests with cloudscraper for image downloading
- Update log file path to use home directory logs
- Add timeout parameter for image requests to prevent hanging
This commit is contained in:
Ching L 2025-12-05 21:07:40 +08:00
parent da1969b103
commit 3bbe483c64

View File

@@ -7,6 +7,7 @@ import re
import redis
import json
import feedparser
import cloudscraper
from loguru import logger
from mastodon import Mastodon
@@ -14,8 +15,8 @@ from mastodon import Mastodon
# logging.basicConfig(filename='/root/develop/log/chh-craler.log', level=logging.INFO)
# logger = logging.getLogger('/root/develop/log/chh-craler.log')
logger.add('/root/develop/log/chh-craler.log', level='INFO')
logger.add('/home/ching/logs/chh-craler.log', level='INFO')
scraper = cloudscraper.create_scraper()
# connect to redis with password
redis_db = redis.StrictRedis(host="localhost",
@@ -105,7 +106,8 @@ def toot():
continue
# download article image to a temp file
img = requests.get(article['img_url'])
#img = requests.get(article['img_url'])
img = scraper.get(article['img_url'], timeout=10)
# upload article image to mastodon
media = mastodon_client.media_post(img.content, 'image/jpeg')
# toot the article info