feat(crawler): add cloudscraper to bypass Cloudflare protection

- Replace requests with cloudscraper for image downloading
- Update the log file path to point to the logs directory under the user's home directory
- Add timeout parameter for image requests to prevent hanging
This commit is contained in:
Ching L 2025-12-05 21:07:40 +08:00
parent da1969b103
commit 3bbe483c64

View File

@@ -7,6 +7,7 @@ import re
import redis import redis
import json import json
import feedparser import feedparser
import cloudscraper
from loguru import logger from loguru import logger
from mastodon import Mastodon from mastodon import Mastodon
@@ -14,8 +15,8 @@ from mastodon import Mastodon
# logging.basicConfig(filename='/root/develop/log/chh-craler.log', level=logging.INFO) # logging.basicConfig(filename='/root/develop/log/chh-craler.log', level=logging.INFO)
# logger = logging.getLogger('/root/develop/log/chh-craler.log') # logger = logging.getLogger('/root/develop/log/chh-craler.log')
logger.add('/root/develop/log/chh-craler.log', level='INFO') logger.add('/home/ching/logs/chh-craler.log', level='INFO')
scraper = cloudscraper.create_scraper()
# connect to redis with password # connect to redis with password
redis_db = redis.StrictRedis(host="localhost", redis_db = redis.StrictRedis(host="localhost",
@@ -105,7 +106,8 @@ def toot():
continue continue
# download article image to a temp file # download article image to a temp file
img = requests.get(article['img_url']) #img = requests.get(article['img_url'])
img = scraper.get(article['img_url'], timeout=10)
# upload article image to mastodon # upload article image to mastodon
media = mastodon_client.media_post(img.content, 'image/jpeg') media = mastodon_client.media_post(img.content, 'image/jpeg')
# toot the article info # toot the article info