### A crawler for the website: https://www.chiphell.com/
### Pulls the portal RSS feed, caches new articles in Redis, and toots
### them (one per run) to a Mastodon instance.
import requests
from bs4 import BeautifulSoup
import re
import redis
import json
import feedparser
import cloudscraper
from loguru import logger
from mastodon import Mastodon

logger.add('/home/ching/logs/chh-craler.log', level='INFO')

# cloudscraper handles Cloudflare's anti-bot challenge when downloading
# article images from chiphell.
scraper = cloudscraper.create_scraper()

# Local Redis, db 0. NOTE(review): no password is actually configured,
# despite what the original comment claimed.
redis_db = redis.StrictRedis(host="localhost", port=6379, db=0)

# SECURITY NOTE(review): credentials are hard-coded in source; they should
# be moved to environment variables or a config file and rotated.
mastodon_client = Mastodon(
    access_token='8LIqGXmerhP8QABT3ppe-1HDATfbmJ-8iDY1_QxNkjk',
    api_base_url='https://nofan.xyz/'
)

# Mapping from main category to its sub-categories.  The values are the
# forum section names exactly as they appear in the RSS feed's first tag;
# they are also used verbatim as Mastodon hashtags, so do not edit them.
CATEGORY_MAPPING = {
    '照片': ['人物肖像', '风景游记', '微距静物', '人文扫街', '动物植物', '其它作品'],
    '电脑': ['配件开箱', '整机搭建', '升级改造', '桌面书房', 'QNAP专区'],
    '掌设': ['智能手机', '智能穿戴', '笔电平板', '周边附件', 'ThinkPad专区'],
    '摄影': ['微单卡片', '单反单电', '经典旁轴', '怀旧菲林', '影音摄像', '周边附件'],
    '汽车': ['买菜车', '商务车', '性能车', '旅行车', 'SUV', 'MPV', '摩托轻骑', '改装配件', 'CHH Auto Club'],
    '单车': ['山地车', '公路车', '折叠车', '休旅车'],
    '模型': ['人偶手办', '比例成品', '拼装自组', 'RC遥控'],
    '败家': ['收藏爱好', '品质生活', '数码前沿', '小资情调', '女王最大', '吃喝玩乐', '育儿分享'],
    '时尚': ['鞋类', '服饰', '箱包'],
    '腕表': ['机械表', '电子表'],
    '视听': ['耳机耳放', '音箱功放', '解码转盘', '随身设备', '唱片录音'],
    '美食': ['当地美食', '世界美食', '私房菜品', '美食器材']
}


def get_main_category(sub_category):
    """Return the main category a sub-category belongs to.

    Returns None when *sub_category* is unknown or is itself a main
    category name.
    """
    for main_category, sub_categories in CATEGORY_MAPPING.items():
        if sub_category in sub_categories:
            return main_category
    return None


def save_to_redis(article):
    """Cache *article* under ``chh-article:<id>`` with a 7-day TTL.

    Returns True only when the article was newly stored; False when an
    entry for the same article id already exists.  (The original returned
    True unconditionally, which made the caller re-log every known
    article on every run — fixed.)
    """
    key = 'chh-article:%s' % article['article_id']
    if not redis_db.get(key):
        # Shorten the URL only for articles we are about to store, so we
        # don't burn shortener quota on duplicates.
        article['url'] = url_shorten(article['url'])
        redis_db.set(key, json.dumps(article), ex=3600 * 24 * 7)
        return True
    return False


def url_shorten(url):
    """Shorten *url* via the s.tunpok.com API; fall back to the original
    URL on any non-201 response.

    SECURITY NOTE(review): the API key is hard-coded — move to config.
    """
    api_url = "https://s.tunpok.com/api/v2/links"
    api_key = "HJAs1~1sjd5mX66_Ydg8~9oSuqgkd75VeLhjjni6"
    headers = {
        'x-api-key': api_key,
    }
    # Timeout added so a slow shortener cannot hang the whole cron run.
    resp = requests.post(api_url, headers=headers, json={"target": url}, timeout=10)
    if resp.status_code == 201:
        return resp.json()['link']
    else:
        return url


def crawler():
    """Fetch the chiphell portal RSS feed and cache each new entry in Redis."""
    rss_url = 'https://www.chiphell.com/portal.php?mod=rss'
    feed = feedparser.parse(rss_url)
    for entry in feed.entries:
        title = entry.title
        url = entry.link
        date = entry.published
        author = entry.get('author', '未知')
        # Category is the first RSS tag, if any.
        category = entry.tags[0]['term'] if 'tags' in entry and entry.tags else '未分类'
        main_category = get_main_category(category)
        # Summary text; strip a trailing ellipsis the feed appends.
        # BUG FIX: str.removesuffix returns a new string — the original
        # discarded the result, so the '...' was never removed.
        content = entry.summary
        content = content.removesuffix('...')
        # Image URL: take the first image/jpeg enclosure link.
        img_url = ''
        for link in entry.get('links', []):
            if link.get('type', '') == 'image/jpeg' and link.get('rel') == 'enclosure':
                img_url = link.get('href')
                # NOTE(review): stripping the domain leaves a relative
                # path, which toot() then feeds to scraper.get() — that
                # looks broken (requests needs an absolute URL). Kept
                # as-is pending confirmation of intent.
                img_url = img_url.replace('https://www.chiphell.com/', '')
                break
        # article id comes from the link, e.g. ...article-12345-1.html
        article_id_match = re.search(r'article-(\d+)-1\.html', url)
        article_id = article_id_match.group(1) if article_id_match else 'unknown'
        article = {
            'img_url': img_url,
            'title': title,
            'author': author,
            'date': date,
            'category': category,
            'main_category': main_category,
            'content': content,
            'url': url,
            'article_id': article_id
        }
        # Log only articles that were actually newly cached.
        if save_to_redis(article):
            logger.info(article)


def toot():
    """Post at most one not-yet-tooted cached article to Mastodon.

    Already-posted article ids are tracked in the Redis set
    'send-chh-article-id'; the ``break`` after the first successful post
    deliberately rate-limits to one toot per run.
    """
    keys = redis_db.keys('chh-article:*')
    for key in keys:
        article = json.loads(redis_db.get(key))
        # Skip anything we have already tooted.
        if redis_db.sismember('send-chh-article-id', article['article_id']):
            continue
        # Download the article image (via cloudscraper to pass Cloudflare).
        img = scraper.get(article['img_url'], timeout=10)
        media = mastodon_client.media_post(img.content, 'image/jpeg')
        # Build hashtags: "#main #sub" when a main category is known.
        if article.get('main_category'):
            category_tags = f"#{article['main_category']} #{article['category']}"
        else:
            category_tags = f"#{article['category']}"
        toot_content = """{title} - 
{category_tags} by {author} \n {content} \n\n {url} \n\n #chh #chiphell \n """.format(
            title=article['title'],
            category_tags=category_tags,
            author=article['author'],
            content=article['content'],
            url=article['url'])
        mastodon_client.status_post(toot_content, media_ids=[media['id']])
        logger.info('Toot %s' % article['title'])
        # Mark as sent so we never toot it again.
        redis_db.sadd('send-chh-article-id', article['article_id'])
        break


if __name__ == '__main__':
    crawler()
    toot()
    # Heartbeat ping for uptime monitoring; timeout added so it can't hang.
    requests.get('https://up.tunpok.com/api/push/yoLZhc0rZR?status=up&msg=OK&ping=', timeout=10)