Updated the crawler to use the logger for outputting article information and toot notifications, enhancing the logging mechanism for improved monitoring and debugging.
130 lines · 3.8 KiB · Python
### a crawler for the website: https://www.chiphell.com/
|
||
|
||
|
||
import requests
|
||
from bs4 import BeautifulSoup
|
||
import re
|
||
import redis
|
||
import json
|
||
import feedparser
|
||
from loguru import logger
|
||
|
||
from mastodon import Mastodon
|
||
|
||
|
||
# Previous stdlib-logging setup, kept for reference:
# logging.basicConfig(filename='/root/develop/log/chh-craler.log', level=logging.INFO)
# logger = logging.getLogger('/root/develop/log/chh-craler.log')
# Route loguru output to the crawler's log file at INFO level.
logger.add('/root/develop/log/chh-craler.log', level='INFO')


# connect to redis with password
# NOTE(review): despite the comment above, no password is passed here —
# confirm whether the local Redis actually requires auth.
redis_db = redis.StrictRedis(host="localhost",
                             port=6379, db=0)

# Mastodon client used by toot() to publish articles.
# NOTE(review): the access token is a live secret hardcoded in source;
# consider moving it (and the shortener API key below) to env vars/config.
mastodon_client = Mastodon(
    access_token = '8LIqGXmerhP8QABT3ppe-1HDATfbmJ-8iDY1_QxNkjk',
    api_base_url = 'https://nofan.xyz/'
)
|
||
|
||
|
||
def save_to_redis(article):
    """Store a crawled article in Redis if it has not been seen before.

    Args:
        article: dict with at least 'article_id' and 'url' keys; its 'url'
            is replaced in place with a shortened link when the article is new.

    Returns:
        True if the article was new and stored, False if it was already
        present. (The original returned an implicit None on the duplicate
        path; False is explicit and equally falsy for existing callers.)
    """
    key = 'chh-article:%s' % article['article_id']
    if redis_db.get(key):
        return False
    # Shorten the URL only for new articles, so the shortener API is not
    # hit again for duplicates.
    article['url'] = url_shorten(article['url'])
    # Expire after 7 days so the keyspace does not grow without bound.
    redis_db.set(key, json.dumps(article), ex=3600 * 24 * 7)
    return True
|
||
|
||
def url_shorten(url, timeout=10):
    """Shorten *url* via the s.tunpok.com link-shortener API.

    Args:
        url: the long URL to shorten.
        timeout: seconds to wait for the shortener (new, defaults to 10 so
            a stalled API cannot hang the whole cron run).

    Returns:
        The shortened link on success (HTTP 201), otherwise the original
        *url* unchanged — callers never lose the link on failure.
    """
    api_url = "https://s.tunpok.com/api/v2/links"
    # NOTE(review): live API key hardcoded in source; move to env/config.
    api_key = "HJAs1~1sjd5mX66_Ydg8~9oSuqgkd75VeLhjjni6"
    headers = {
        'x-api-key': api_key,
    }
    try:
        resp = requests.post(api_url, headers=headers,
                             json={"target": url}, timeout=timeout)
    except requests.RequestException:
        # Network failure must not abort the crawl; fall back to the
        # original URL and keep a traceback in the log.
        logger.exception('url_shorten failed for {}', url)
        return url
    if resp.status_code == 201:
        return resp.json()['link']
    return url
|
||
|
||
def crawler():
    """Fetch the Chiphell portal RSS feed and store new articles in Redis.

    For each feed entry, builds an article dict (title, url, date, author,
    category, summary, image URL, article id) and hands it to
    save_to_redis(); newly stored articles are logged at INFO.
    """
    rss_url = 'https://www.chiphell.com/portal.php?mod=rss'
    feed = feedparser.parse(rss_url)

    for entry in feed.entries:
        # Title
        title = entry.title
        # Link
        url = entry.link
        # Publication time
        date = entry.published
        # Author (feed value '未知' = "unknown" when absent)
        author = entry.get('author', '未知')
        # Category: first tag term ('未分类' = "uncategorized" when absent)
        category = entry.tags[0]['term'] if 'tags' in entry and entry.tags else '未分类'
        # Summary text; strip the trailing ellipsis the feed appends.
        # BUG FIX: str.removesuffix returns a new string (str is immutable);
        # the original called it and discarded the result.
        content = entry.summary.removesuffix('...')

        # Image URL: look for an image/jpeg enclosure among the entry links.
        img_url = ''
        for link in entry.get('links', []):
            if link.get('type', '') == 'image/jpeg' and link.get('rel') == 'enclosure':
                img_url = link.get('href')
                # NOTE(review): stripping the host leaves a relative path,
                # but toot() later requests.get()s this value directly —
                # confirm this is intended.
                img_url = img_url.replace('https://www.chiphell.com/', '')
                break

        # Extract the numeric article id from e.g. ".../article-12345-1.html".
        article_id_match = re.search(r'article-(\d+)-1\.html', url)
        article_id = article_id_match.group(1) if article_id_match else 'unknown'

        # Package everything into one dict for storage.
        article = {
            'img_url': img_url,
            'title': title,
            'author': author,
            'date': date,
            'category': category,
            'content': content,
            'url': url,
            'article_id': article_id,
        }

        # save the article info to redis; log only articles that were new.
        if save_to_redis(article):
            logger.info(article)
|
||
|
||
def toot():
    """Post at most one not-yet-tooted article from Redis to Mastodon.

    Scans all 'chh-article:*' keys, skips ids already in the
    'send-chh-article-id' set, and for the first unsent article uploads
    its image, posts a status, records the id as sent, then stops
    (the trailing ``break`` — presumably deliberate rate limiting of
    one toot per invocation; confirm).
    """
    # get all the keys in redis
    keys = redis_db.keys('chh-article:*')
    # get the article info from redis
    for key in keys:
        article = json.loads(redis_db.get(key))
        # get send article id from redis set 'send-chh-article-id'
        # if the article['id'] is in the set, skip it
        if redis_db.sismember('send-chh-article-id', article['article_id']):
            continue

        # download article image to a temp file
        # NOTE(review): img_url may be empty or a relative path (crawler
        # strips the host), in which case this GET fails — confirm upstream.
        img = requests.get(article['img_url'])
        # upload article image to mastodon
        media = mastodon_client.media_post(img.content, 'image/jpeg')
        # toot the article info
        toot_content = """{title} - #{category} by {author} \n {content} \n\n {url} \n\n #chh #chiphell \n """.format(
            title=article['title'],
            category=article['category'],
            author=article['author'],
            content=article['content'],
            url=article['url'])

        mastodon_client.status_post(toot_content, media_ids=[media['id']])
        logger.info('Toot %s' % article['title'])

        # add the article['id'] to the set
        redis_db.sadd('send-chh-article-id', article['article_id'])
        break
|
||
|
||
if __name__ == '__main__':
    crawler()
    toot()
    # Ping the uptime monitor only after both steps completed without
    # raising. Timeout added so a slow monitor cannot hang the cron job.
    requests.get(
        'https://up.tunpok.com/api/push/yoLZhc0rZR?status=up&msg=OK&ping=',
        timeout=10)
|