chh-crawler/crawler.py
Ching L 15addaba24 feat(crawler): update crawler to use RSS feed for article retrieval
Replaced HTML scraping with RSS feed parsing to fetch article details including title, URL, author, date, category, content, and image link. This improves reliability and efficiency in gathering articles from the source.
2025-04-07 11:32:00 +08:00

129 lines
3.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

### a crawler for the website: https://www.chiphell.com/
import requests
from bs4 import BeautifulSoup
import re
import redis
import json
import feedparser
from loguru import logger
from mastodon import Mastodon
# logging.basicConfig(filename='/root/develop/log/chh-craler.log', level=logging.INFO)
# logger = logging.getLogger('/root/develop/log/chh-craler.log')
# Append INFO-and-above records to the crawler's log file via loguru.
logger.add('/root/develop/log/chh-craler.log', level='INFO')
# connect to redis with password
# NOTE(review): despite the comment above, no password is passed — confirm the
# local redis instance is intentionally unauthenticated.
redis_db = redis.StrictRedis(host="localhost",
                             port=6379, db=0)
# Mastodon API client used by toot() to publish articles.
# NOTE(review): hardcoded access token is a committed secret — it should be
# revoked and loaded from an environment variable or config file instead.
mastodon_client = Mastodon(
    access_token = '8LIqGXmerhP8QABT3ppe-1HDATfbmJ-8iDY1_QxNkjk',
    api_base_url = 'https://nofan.xyz/'
)
def save_to_redis(article):
    """Persist *article* to redis if it has not been seen before.

    Side effect: on first sight, ``article['url']`` is replaced in place with
    its shortened form before serialization.

    Returns True when the article was newly stored, False when the key
    already exists (previously this path implicitly returned None; False is
    equally falsy, so callers using truthiness are unaffected).
    """
    key = 'chh-article:%s' % article['article_id']
    if not redis_db.get(key):
        article['url'] = url_shorten(article['url'])
        # Expire after 7 days so the dedup window is bounded.
        redis_db.set(key, json.dumps(article), ex=3600*24*7)
        return True
    return False
def url_shorten(url):
    """Shorten *url* via the s.tunpok.com link API.

    Returns the shortened link on success (HTTP 201), or the original *url*
    unchanged on any HTTP error or network failure — shortening is strictly
    best-effort and must never abort the crawl.
    """
    api_url = "https://s.tunpok.com/api/v2/links"
    # NOTE(review): hardcoded API key is a committed secret — rotate it and
    # load from an environment variable instead.
    api_key = "HJAs1~1sjd5mX66_Ydg8~9oSuqgkd75VeLhjjni6"
    headers = {
        'x-api-key': api_key,
    }
    try:
        # Timeout added: the original call could hang forever and had no
        # exception handling, so one flaky request crashed the whole run.
        resp = requests.post(api_url, headers=headers,
                             json={"target": url}, timeout=10)
    except requests.RequestException:
        return url
    if resp.status_code == 201:
        return resp.json()['link']
    return url
def crawler():
    """Fetch the chiphell RSS feed and store each new article in redis.

    For every feed entry, extracts title/url/date/author/category/summary/
    image link and the numeric article id, then hands the dict to
    save_to_redis(); newly stored articles are printed.
    """
    rss_url = 'https://www.chiphell.com/portal.php?mod=rss'
    feed = feedparser.parse(rss_url)
    for entry in feed.entries:
        # Title
        title = entry.title
        # Link
        url = entry.link
        # Publication time (raw RSS string, not parsed)
        date = entry.published
        # Author (fallback string means "unknown")
        author = entry.get('author', '未知')
        # Category: first tag if present (fallback means "uncategorized")
        category = entry.tags[0]['term'] if 'tags' in entry and entry.tags else '未分类'
        # Summary content.
        content = entry.summary
        # BUG FIX: str.removesuffix returns a new string (str is immutable);
        # the original call discarded its result, so the trailing '...' was
        # never actually stripped. Rebind the result.
        content = content.removesuffix('...')
        # Image link: look for an enclosure link of type image/jpeg.
        img_url = ''
        for link in entry.get('links', []):
            if link.get('type', '') == 'image/jpeg' and link.get('rel') == 'enclosure':
                img_url = link.get('href')
                # NOTE(review): stripping the domain leaves a relative path,
                # yet toot() later fetches img_url with requests.get(), which
                # needs an absolute URL — confirm feed image hrefs don't
                # actually carry this prefix, otherwise this breaks toot().
                img_url = img_url.replace('https://www.chiphell.com/', '')
                break
        # Extract article_id from the link with a regex
        # (e.g. .../article-12345-1.html -> '12345').
        article_id_match = re.search(r'article-(\d+)-1\.html', url)
        article_id = article_id_match.group(1) if article_id_match else 'unknown'
        # Bundle everything into one dict.
        article = {
            'img_url': img_url,
            'title': title,
            'author': author,
            'date': date,
            'category': category,
            'content': content,
            'url': url,
            'article_id': article_id
        }
        # save the article info to redis; print only when newly stored
        if save_to_redis(article):
            print(article)
def toot():
    """Post one not-yet-published cached article to Mastodon.

    Scans all 'chh-article:*' keys in redis, skips articles whose id is
    already in the 'send-chh-article-id' set, and for the first unsent one:
    downloads its image, uploads it as Mastodon media, posts the status,
    records the id as sent, and stops (at most one post per invocation).
    """
    # get all the keys in redis
    keys = redis_db.keys('chh-article:*')
    # get the article info from redis
    for key in keys:
        article = json.loads(redis_db.get(key))
        # Skip anything already published: membership in the
        # 'send-chh-article-id' set marks an article as sent.
        if redis_db.sismember('send-chh-article-id', article['article_id']):
            continue
        # Download the article image into memory.
        # NOTE(review): img_url may be a relative path (crawler() strips the
        # domain prefix) and the HTTP status is never checked — confirm this
        # works against real feed data.
        img = requests.get(article['img_url'])
        # Upload the image bytes to Mastodon as media.
        media = mastodon_client.media_post(img.content, 'image/jpeg')
        # Compose the status text from the cached article fields.
        toot_content = """{title} - #{category} by {author} \n {content} \n\n {url} \n\n #chh #chiphell \n """.format(
            title=article['title'],
            category=article['category'],
            author=article['author'],
            content=article['content'],
            url=article['url'])
        mastodon_client.status_post(toot_content, media_ids=[media['id']])
        # Mark this article id as sent so it is never posted again.
        redis_db.sadd('send-chh-article-id', article['article_id'])
        # Deliberate: publish only ONE article per run — presumably to
        # throttle posting frequency; confirm before changing.
        break
if __name__ == '__main__':
    # Refresh the article cache, then publish at most one pending article.
    crawler()
    toot()
    # Heartbeat ping to the uptime monitor. Timeout added so a hung monitor
    # endpoint cannot block the script from exiting (original had none).
    requests.get('https://up.tunpok.com/api/push/yoLZhc0rZR?status=up&msg=OK&ping=', timeout=10)