### a crawler for the website: https://www.chiphell.com/
import requests
from bs4 import BeautifulSoup  # kept from original (currently unused here)
import re
import redis
import json
import feedparser
import cloudscraper
from loguru import logger
from mastodon import Mastodon

logger.add('/home/ching/logs/chh-craler.log', level='INFO')

# cloudscraper bypasses Cloudflare's anti-bot challenge for image downloads.
scraper = cloudscraper.create_scraper()

# Local Redis: article cache (chh-article:<id>) + dedupe set (send-chh-article-id).
redis_db = redis.StrictRedis(host="localhost", port=6379, db=0)

# NOTE(security): the access token is hard-coded in source; move it to an
# environment variable or secrets file and rotate the current value.
mastodon_client = Mastodon(
    access_token='8LIqGXmerhP8QABT3ppe-1HDATfbmJ-8iDY1_QxNkjk',
    api_base_url='https://nofan.xyz/'
)

# Site base URL, used to re-expand image paths that are stored relative.
CHH_BASE_URL = 'https://www.chiphell.com/'


def save_to_redis(article):
    """Persist *article* under ``chh-article:<id>`` with a 7-day TTL.

    The URL is shortened only on first sight. Returns True when the article
    was newly stored (so the caller logs each article exactly once), and
    False when it was already cached.
    """
    key = 'chh-article:%s' % article['article_id']
    if redis_db.get(key):
        return False  # already crawled — skip shortening and logging
    article['url'] = url_shorten(article['url'])
    redis_db.set(key, json.dumps(article), ex=3600 * 24 * 7)
    return True


def url_shorten(url):
    """Shorten *url* via the s.tunpok.com API; on any failure, return *url*."""
    api_url = "https://s.tunpok.com/api/v2/links"
    # NOTE(security): hard-coded API key; move to configuration and rotate.
    api_key = "HJAs1~1sjd5mX66_Ydg8~9oSuqgkd75VeLhjjni6"
    headers = {
        'x-api-key': api_key,
    }
    try:
        # Timeout added: an unreachable shortener must not hang the cron run.
        resp = requests.post(api_url, headers=headers, json={"target": url},
                             timeout=10)
    except requests.RequestException:
        return url  # best-effort: fall back to the long URL
    if resp.status_code == 201:  # API returns 201 Created on success
        return resp.json()['link']
    return url


def crawler():
    """Fetch the Chiphell portal RSS feed and cache each entry in Redis."""
    rss_url = 'https://www.chiphell.com/portal.php?mod=rss'
    feed = feedparser.parse(rss_url)
    for entry in feed.entries:
        # 标题 / title
        title = entry.title
        # 链接 / article link
        url = entry.link
        # 发布时间 / publish date
        date = entry.published
        # 作者 / author (feed may omit it)
        author = entry.get('author', '未知')
        # 分类 — first tag term, if any
        category = entry.tags[0]['term'] if 'tags' in entry and entry.tags else '未分类'
        # 简介内容 — strip the trailing ellipsis the feed appends.
        # BUG FIX: str.removesuffix returns a new string; the original call
        # discarded the result, so the suffix was never removed.
        content = entry.summary.removesuffix('...')
        # 图片链接 — first image/jpeg enclosure, stored site-relative.
        img_url = ''
        for link in entry.get('links', []):
            if link.get('type', '') == 'image/jpeg' and link.get('rel') == 'enclosure':
                img_url = link.get('href')
                img_url = img_url.replace('https://www.chiphell.com/', '')
                break
        # article_id extracted from URLs shaped like article-<id>-1.html
        article_id_match = re.search(r'article-(\d+)-1\.html', url)
        article_id = article_id_match.group(1) if article_id_match else 'unknown'
        article = {
            'img_url': img_url,
            'title': title,
            'author': author,
            'date': date,
            'category': category,
            'content': content,
            'url': url,
            'article_id': article_id,
        }
        # Log each article exactly once, when it is first stored.
        if save_to_redis(article):
            logger.info(article)


def toot():
    """Post one not-yet-sent cached article to Mastodon, then stop.

    Dedupe is tracked in the Redis set 'send-chh-article-id'. The loop
    deliberately breaks after the first toot so each run posts at most one
    article (simple rate limiting).
    """
    keys = redis_db.keys('chh-article:*')
    for key in keys:
        article = json.loads(redis_db.get(key))
        # Skip articles that were already tooted.
        if redis_db.sismember('send-chh-article-id', article['article_id']):
            continue
        media_ids = []
        if article['img_url']:
            # BUG FIX: crawler stores img_url site-relative; re-prepend the
            # base URL, otherwise requests raises MissingSchema.
            img_url = article['img_url']
            if not img_url.startswith('http'):
                img_url = CHH_BASE_URL + img_url
            img = scraper.get(img_url, timeout=10)
            media = mastodon_client.media_post(img.content, 'image/jpeg')
            media_ids.append(media['id'])
        toot_content = """{title} - #{category} by {author} \n {content} \n\n {url} \n\n #chh #chiphell \n """.format(
            title=article['title'],
            category=article['category'],
            author=article['author'],
            content=article['content'],
            url=article['url'])
        # BUG FIX: post without media when no image was found, instead of
        # calling media_post with empty content and crashing.
        mastodon_client.status_post(toot_content,
                                    media_ids=media_ids or None)
        logger.info('Toot %s' % article['title'])
        # Mark as sent so subsequent runs skip it.
        redis_db.sadd('send-chh-article-id', article['article_id'])
        break


if __name__ == '__main__':
    crawler()
    toot()
    # Heartbeat ping so the uptime monitor knows the cron job completed.
    # Timeout added so a dead monitor cannot hang the run.
    requests.get(
        'https://up.tunpok.com/api/push/yoLZhc0rZR?status=up&msg=OK&ping=',
        timeout=10)