# Changelog:
# - Replace requests with cloudscraper for image downloading
# - Update log file path to use home directory logs
# - Add timeout parameter for image requests to prevent hanging
### a crawler for the website: https://www.chiphell.com/
# --- Imports --------------------------------------------------------------
import requests
from bs4 import BeautifulSoup
import re
import redis
import json
import feedparser
import cloudscraper
from loguru import logger

from mastodon import Mastodon


# File sink for persistent logs (loguru's default stderr sink stays active).
logger.add('/home/ching/logs/chh-craler.log', level='INFO')

# cloudscraper session: bypasses Cloudflare challenges on image downloads.
scraper = cloudscraper.create_scraper()

# Local redis instance, default DB, no authentication.
redis_db = redis.StrictRedis(host="localhost",
                             port=6379, db=0)

# Mastodon API client for posting toots.
# NOTE(review): access token is hardcoded — consider loading it from the
# environment instead of committing it to source control.
mastodon_client = Mastodon(
    access_token = '8LIqGXmerhP8QABT3ppe-1HDATfbmJ-8iDY1_QxNkjk',
    api_base_url = 'https://nofan.xyz/'
)
def save_to_redis(article):
    """Persist a crawled article in redis if it has not been seen before.

    The article's URL is shortened before storage, and entries expire after
    seven days so the keyspace does not grow without bound.

    Args:
        article: dict with at least 'article_id' and 'url' keys; its 'url'
            value is replaced in place with the shortened link when stored.

    Returns:
        True if the article was new and stored, False if it already existed.
        (The original returned an implicit None for existing articles; an
        explicit False is backward-compatible — both are falsy.)
    """
    key = 'chh-article:%s' % article['article_id']
    if redis_db.get(key):
        # Already crawled — leave the stored entry untouched.
        return False
    article['url'] = url_shorten(article['url'])
    redis_db.set(key, json.dumps(article), ex=3600*24*7)
    return True
def url_shorten(url):
    """Shorten *url* via the s.tunpok.com link-shortener API.

    Falls back to returning the original URL on any network failure or
    non-201 response, so callers never lose the link.

    Args:
        url: the long URL to shorten.

    Returns:
        The shortened link on success, otherwise the input URL unchanged.
    """
    api_url = "https://s.tunpok.com/api/v2/links"
    # NOTE(review): API key is hardcoded — consider loading it from the
    # environment instead of committing it to source control.
    api_key = "HJAs1~1sjd5mX66_Ydg8~9oSuqgkd75VeLhjjni6"
    headers = {
        'x-api-key': api_key,
    }
    try:
        # Timeout keeps a slow/dead shortener from hanging the whole run;
        # any network error degrades gracefully to the long URL.
        resp = requests.post(api_url, headers=headers, json={"target": url},
                             timeout=10)
    except requests.RequestException:
        return url
    if resp.status_code == 201:
        return resp.json()['link']
    return url
def crawler():
    """Fetch the chiphell RSS feed and store new articles in redis.

    Each feed entry is parsed into a dict (title, author, date, category,
    summary, image URL, article id) and handed to save_to_redis(); newly
    stored articles are logged.
    """
    rss_url = 'https://www.chiphell.com/portal.php?mod=rss'
    feed = feedparser.parse(rss_url)

    for entry in feed.entries:
        title = entry.title
        url = entry.link
        date = entry.published
        author = entry.get('author', '未知')
        # First tag is treated as the category; fall back when absent.
        category = entry.tags[0]['term'] if 'tags' in entry and entry.tags else '未分类'
        # Summary, minus the trailing ellipsis the feed appends.
        # BUG FIX: str.removesuffix returns a new string; the original code
        # discarded the result, so the '...' was never actually stripped.
        content = entry.summary.removesuffix('...')

        # Image URL: look for a jpeg enclosure among the entry links.
        img_url = ''
        for link in entry.get('links', []):
            if link.get('type', '') == 'image/jpeg' and link.get('rel') == 'enclosure':
                img_url = link.get('href')
                # NOTE(review): stripping the site prefix leaves a relative
                # URL, which toot() then fetches as-is — confirm the feed
                # really embeds this prefix redundantly in enclosure hrefs.
                img_url = img_url.replace('https://www.chiphell.com/', '')
                break

        # article_id comes from the canonical URL pattern article-<id>-1.html.
        article_id_match = re.search(r'article-(\d+)-1\.html', url)
        article_id = article_id_match.group(1) if article_id_match else 'unknown'

        article = {
            'img_url': img_url,
            'title': title,
            'author': author,
            'date': date,
            'category': category,
            'content': content,
            'url': url,
            'article_id': article_id
        }

        # Only log articles that were newly stored (not seen before).
        if save_to_redis(article):
            logger.info(article)
def toot():
    """Post at most one not-yet-tooted stored article to Mastodon.

    Scans the chh-article:* keys in redis, skips articles already recorded
    in the 'send-chh-article-id' set, downloads the article image, uploads
    it as media, posts the toot, then marks the article as sent.
    """
    keys = redis_db.keys('chh-article:*')
    for key in keys:
        article = json.loads(redis_db.get(key))
        # Skip anything already posted (tracked in a redis set).
        if redis_db.sismember('send-chh-article-id', article['article_id']):
            continue

        # Download the article image (cloudscraper bypasses Cloudflare);
        # the timeout keeps a dead image host from hanging the whole run.
        img = scraper.get(article['img_url'], timeout=10)
        if img.status_code != 200:
            # Don't upload an error page as the "image" — skip this article
            # and retry on a later run (it stays out of the sent set).
            logger.warning('Image download failed (%s): %s'
                           % (img.status_code, article['img_url']))
            continue

        # Upload the image, then post the toot referencing it.
        media = mastodon_client.media_post(img.content, 'image/jpeg')
        toot_content = """{title} - #{category} by {author} \n {content} \n\n {url} \n\n #chh #chiphell \n """.format(
            title=article['title'],
            category=article['category'],
            author=article['author'],
            content=article['content'],
            url=article['url'])

        mastodon_client.status_post(toot_content, media_ids=[media['id']])
        logger.info('Toot %s' % article['title'])

        # Record the article so it is never posted twice.
        redis_db.sadd('send-chh-article-id', article['article_id'])
        # Presumably deliberate: only one toot per invocation (rate
        # limiting) — confirm before removing this break.
        break
if __name__ == '__main__':
    crawler()
    toot()
    # Heartbeat ping to the uptime monitor — only reached when crawler()
    # and toot() completed without raising. Timeout added so a dead
    # monitor endpoint cannot hang the cron job.
    requests.get('https://up.tunpok.com/api/push/yoLZhc0rZR?status=up&msg=OK&ping=',
                 timeout=10)