feat(crawler): add cloudscraper to bypass Cloudflare protection
- Replace requests with cloudscraper for image downloading
- Update log file path to use home directory logs
- Add timeout parameter for image requests to prevent hanging
This commit is contained in:
parent
da1969b103
commit
3bbe483c64
@ -7,6 +7,7 @@ import re
|
||||
import redis
|
||||
import json
|
||||
import feedparser
|
||||
import cloudscraper
|
||||
from loguru import logger
|
||||
|
||||
from mastodon import Mastodon
|
||||
@ -14,8 +15,8 @@ from mastodon import Mastodon
|
||||
|
||||
# logging.basicConfig(filename='/root/develop/log/chh-craler.log', level=logging.INFO)
|
||||
# logger = logging.getLogger('/root/develop/log/chh-craler.log')
|
||||
logger.add('/root/develop/log/chh-craler.log', level='INFO')
|
||||
|
||||
logger.add('/home/ching/logs/chh-craler.log', level='INFO')
|
||||
scraper = cloudscraper.create_scraper()
|
||||
|
||||
# connect to redis with password
|
||||
redis_db = redis.StrictRedis(host="localhost",
|
||||
@ -105,7 +106,8 @@ def toot():
|
||||
continue
|
||||
|
||||
# download article image to a temp file
|
||||
img = requests.get(article['img_url'])
|
||||
#img = requests.get(article['img_url'])
|
||||
img = scraper.get(article['img_url'], timeout=10)
|
||||
# upload article image to mastodon
|
||||
media = mastodon_client.media_post(img.content, 'image/jpeg')
|
||||
# toot the article info
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user