chh-crawler/crawler.py
Ching L 6d1fffb63d feat(crawler): add main category support for better classification
- Add CATEGORY_MAPPING dictionary to map sub-categories to main categories
- Implement get_main_category function to find parent category
- Include main_category field in article data structure
- Update toot function to display both main and sub categories intelligently
- Avoid duplication when main category is the same as sub category
2025-12-09 10:58:01 +08:00

165 lines
5.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

### a crawler for the website: https://www.chiphell.com/
import requests
from bs4 import BeautifulSoup
import re
import redis
import json
import feedparser
import cloudscraper
from loguru import logger
from mastodon import Mastodon
# Module-level setup: log sink, Cloudflare-capable HTTP client, Redis
# connection, and Mastodon API client. All of this runs at import time.
logger.add('/home/ching/logs/chh-craler.log', level='INFO')
# cloudscraper transparently solves Cloudflare anti-bot challenges; used
# below in toot() to download article images.
scraper = cloudscraper.create_scraper()
# Local Redis, database 0. (No password is actually passed, despite the
# original note saying "connect to redis with password".)
redis_db = redis.StrictRedis(host="localhost",
port=6379, db=0)
# NOTE(review): the access token is committed in source — it should be moved
# to an environment variable / config file and the leaked token revoked.
mastodon_client = Mastodon(
access_token = '8LIqGXmerhP8QABT3ppe-1HDATfbmJ-8iDY1_QxNkjk',
api_base_url = 'https://nofan.xyz/'
)
# Maps each main (parent) category to the list of sub-categories it contains.
# get_main_category() scans this table to resolve a feed tag to its parent.
CATEGORY_MAPPING = {
    '照片': ['人物肖像', '风景游记', '微距静物', '人文扫街', '动物植物', '其它作品'],
    '电脑': ['配件开箱', '整机搭建', '升级改造', '桌面书房', 'QNAP专区'],
    '掌设': ['智能手机', '智能穿戴', '笔电平板', '周边附件', 'ThinkPad专区'],
    '摄影': ['微单卡片', '单反单电', '经典旁轴', '怀旧菲林', '影音摄像', '周边附件'],
    '汽车': ['买菜车', '商务车', '性能车', '旅行车', 'SUV', 'MPV', '摩托轻骑', '改装配件', 'CHH Auto Club'],
    '单车': ['山地车', '公路车', '折叠车', '休旅车'],
    '模型': ['人偶手办', '比例成品', '拼装自组', 'RC遥控'],
    '败家': ['收藏爱好', '品质生活', '数码前沿', '小资情调', '女王最大', '吃喝玩乐', '育儿分享'],
    '时尚': ['鞋类', '服饰', '箱包'],
    '腕表': ['机械表', '电子表'],
    '视听': ['耳机耳放', '音箱功放', '解码转盘', '随身设备', '唱片录音'],
    '美食': ['当地美食', '世界美食', '私房菜品', '美食器材'],
}
def get_main_category(sub_category):
    """Return the main category whose sub-category list contains *sub_category*.

    Returns None when *sub_category* is unknown — including when it is
    itself a main-category name rather than a sub-category.
    """
    matches = (
        main for main, subs in CATEGORY_MAPPING.items() if sub_category in subs
    )
    return next(matches, None)
def save_to_redis(article):
    """Persist *article* in Redis under key ``chh-article:<article_id>``.

    The article's URL is shortened before storage, and the entry expires
    after seven days so Redis does not grow without bound.

    Returns True when the article was new and stored, False when a record
    with the same article_id already exists. (The original returned an
    implicit None for duplicates; the caller truth-tests the result, so an
    explicit False is backward-compatible and clearer.)
    """
    key = 'chh-article:%s' % article['article_id']
    if redis_db.get(key):
        # Already crawled within the TTL window — skip.
        return False
    # Shorten only new articles' links, to avoid wasting shortener API calls.
    article['url'] = url_shorten(article['url'])
    redis_db.set(key, json.dumps(article), ex=3600 * 24 * 7)
    return True
def url_shorten(url):
    """Shorten *url* via the s.tunpok.com API, best-effort.

    Returns the short link on success (HTTP 201); on any network error or
    other status code the original *url* is returned unchanged, so crawling
    never fails because of the shortener.
    """
    api_url = "https://s.tunpok.com/api/v2/links"
    # NOTE(review): API key is hard-coded in source — move to an environment
    # variable / config file and rotate the committed key.
    api_key = "HJAs1~1sjd5mX66_Ydg8~9oSuqgkd75VeLhjjni6"
    headers = {
        'x-api-key': api_key,
    }
    try:
        # Timeout added: the original call could hang the crawler forever.
        resp = requests.post(api_url, headers=headers, json={"target": url},
                             timeout=10)
    except requests.RequestException:
        return url
    if resp.status_code == 201:
        return resp.json()['link']
    return url
def crawler():
    """Fetch the Chiphell portal RSS feed and store each new article in Redis.

    For every feed entry this extracts title, link, date, author, category
    (plus its parent main category), summary, enclosure image, and the
    numeric article id, then hands the dict to save_to_redis(). Only newly
    stored articles are logged.
    """
    rss_url = 'https://www.chiphell.com/portal.php?mod=rss'
    feed = feedparser.parse(rss_url)
    for entry in feed.entries:
        title = entry.title
        url = entry.link
        date = entry.published
        author = entry.get('author', '未知')
        # Category = the first tag's term, when tags exist.
        category = entry.tags[0]['term'] if 'tags' in entry and entry.tags else '未分类'
        main_category = get_main_category(category)
        # BUG FIX: str.removesuffix() returns a new string (str is immutable);
        # the original discarded the result, so the trailing '...' was never
        # actually stripped. Assign the return value.
        content = entry.summary.removesuffix('...')
        # Image URL: the enclosure link of type image/jpeg, if present,
        # stored relative to the site root.
        img_url = ''
        for link in entry.get('links', []):
            if link.get('type', '') == 'image/jpeg' and link.get('rel') == 'enclosure':
                img_url = link.get('href')
                img_url = img_url.replace('https://www.chiphell.com/', '')
                break
        # The canonical article URL looks like .../article-<id>-1.html.
        article_id_match = re.search(r'article-(\d+)-1\.html', url)
        article_id = article_id_match.group(1) if article_id_match else 'unknown'
        article = {
            'img_url': img_url,
            'title': title,
            'author': author,
            'date': date,
            'category': category,
            'main_category': main_category,
            'content': content,
            'url': url,
            'article_id': article_id
        }
        # Log only articles that were actually new (save_to_redis dedupes).
        if save_to_redis(article):
            logger.info(article)
def toot():
    """Post one not-yet-sent article from Redis to Mastodon, then stop.

    At most a single article is tooted per invocation (the trailing
    ``break``), which acts as a simple rate limit when run from cron.
    Sent article ids are tracked in the Redis set 'send-chh-article-id'.
    """
    keys = redis_db.keys('chh-article:*')
    for key in keys:
        article = json.loads(redis_db.get(key))
        # Skip anything that has already been tooted.
        if redis_db.sismember('send-chh-article-id', article['article_id']):
            continue
        # Upload the article image when there is one. BUG FIXES: (1) the
        # original crashed on articles with an empty img_url; (2) crawler()
        # stores the image path relative to the site root, but the original
        # fetched that relative path directly — restore the base URL first.
        media_ids = []
        img_url = article.get('img_url', '')
        if img_url:
            if not img_url.startswith('http'):
                img_url = 'https://www.chiphell.com/' + img_url
            img = scraper.get(img_url, timeout=10)
            media = mastodon_client.media_post(img.content, 'image/jpeg')
            media_ids = [media['id']]
        # Show "#main #sub" when a parent category exists, otherwise "#sub".
        if article.get('main_category'):
            category_tags = f"#{article['main_category']} #{article['category']}"
        else:
            category_tags = f"#{article['category']}"
        toot_content = """{title} - {category_tags} by {author} \n {content} \n\n {url} \n\n #chh #chiphell \n """.format(
            title=article['title'],
            category_tags=category_tags,
            author=article['author'],
            content=article['content'],
            url=article['url'])
        mastodon_client.status_post(toot_content, media_ids=media_ids)
        logger.info('Toot %s' % article['title'])
        # Remember the id so the article is never posted twice.
        redis_db.sadd('send-chh-article-id', article['article_id'])
        break
if __name__ == '__main__':
    crawler()
    toot()
    # Health-check ping to the uptime monitor. Timeout added so a slow
    # monitor endpoint cannot hang the cron job; a failed ping is non-fatal
    # (the crawl itself already succeeded at this point).
    try:
        requests.get(
            'https://up.tunpok.com/api/push/yoLZhc0rZR?status=up&msg=OK&ping=',
            timeout=10)
    except requests.RequestException as e:
        logger.warning('Health ping failed: %s' % e)