feat(crawler): add main category support for better classification
- Add CATEGORY_MAPPING dictionary to map sub-categories to main categories
- Implement get_main_category function to find parent category
- Include main_category field in article data structure
- Update toot function to display both main and sub categories intelligently
- Avoid duplication when main category is the same as sub category
This commit is contained in:
parent
3bbe483c64
commit
6d1fffb63d
37
crawler.py
37
crawler.py
@ -27,6 +27,30 @@ mastodon_client = Mastodon(
|
||||
api_base_url = 'https://nofan.xyz/'
|
||||
)
|
||||
|
||||
# Mapping from each main category to the list of its sub-categories.
# NOTE: '周边附件' is listed under both '掌设' and '摄影', so a lookup by
# sub-category alone cannot tell those two parents apart.
CATEGORY_MAPPING = {
    '照片': [
        '人物肖像', '风景游记', '微距静物', '人文扫街', '动物植物', '其它作品',
    ],
    '电脑': [
        '配件开箱', '整机搭建', '升级改造', '桌面书房', 'QNAP专区',
    ],
    '掌设': [
        '智能手机', '智能穿戴', '笔电平板', '周边附件', 'ThinkPad专区',
    ],
    '摄影': [
        '微单卡片', '单反单电', '经典旁轴', '怀旧菲林', '影音摄像', '周边附件',
    ],
    '汽车': [
        '买菜车', '商务车', '性能车', '旅行车', 'SUV', 'MPV',
        '摩托轻骑', '改装配件', 'CHH Auto Club',
    ],
    '单车': [
        '山地车', '公路车', '折叠车', '休旅车',
    ],
    '模型': [
        '人偶手办', '比例成品', '拼装自组', 'RC遥控',
    ],
    '败家': [
        '收藏爱好', '品质生活', '数码前沿', '小资情调', '女王最大', '吃喝玩乐', '育儿分享',
    ],
    '时尚': [
        '鞋类', '服饰', '箱包',
    ],
    '腕表': [
        '机械表', '电子表',
    ],
    '视听': [
        '耳机耳放', '音箱功放', '解码转盘', '随身设备', '唱片录音',
    ],
    '美食': [
        '当地美食', '世界美食', '私房菜品', '美食器材',
    ],
}
|
||||
|
||||
def get_main_category(sub_category):
    """Return the main category whose sub-category list contains ``sub_category``.

    CATEGORY_MAPPING is searched in its own iteration order and the first
    main category that lists ``sub_category`` is returned, so a name that
    appears under several main categories resolves to the earliest one.

    Returns None when no sub-category list contains the value; this includes
    the case where the value is itself a main-category name.
    """
    candidates = (
        parent
        for parent, children in CATEGORY_MAPPING.items()
        if sub_category in children
    )
    # next() with a default yields the first match, or None if there is none.
    return next(candidates, None)
|
||||
|
||||
|
||||
def save_to_redis(article):
|
||||
key = 'chh-article:%s' % article['article_id']
|
||||
@ -62,6 +86,8 @@ def crawler():
|
||||
author = entry.get('author', '未知')
|
||||
# 分类(第一个 tag)
|
||||
category = entry.tags[0]['term'] if 'tags' in entry and entry.tags else '未分类'
|
||||
# 获取主分类
|
||||
main_category = get_main_category(category)
|
||||
# 简介内容
|
||||
content = entry.summary
|
||||
content.removesuffix('...')
|
||||
@ -85,6 +111,7 @@ def crawler():
|
||||
'author': author,
|
||||
'date': date,
|
||||
'category': category,
|
||||
'main_category': main_category,
|
||||
'content': content,
|
||||
'url': url,
|
||||
'article_id': article_id
|
||||
@ -110,10 +137,16 @@ def toot():
|
||||
img = scraper.get(article['img_url'], timeout=10)
|
||||
# upload article image to mastodon
|
||||
media = mastodon_client.media_post(img.content, 'image/jpeg')
|
||||
# 构建分类标签
|
||||
if article.get('main_category'):
|
||||
category_tags = f"#{article['main_category']} #{article['category']}"
|
||||
else:
|
||||
category_tags = f"#{article['category']}"
|
||||
|
||||
# toot the article info
|
||||
toot_content = """{title} - #{category} by {author} \n {content} \n\n {url} \n\n #chh #chiphell \n """.format(
|
||||
toot_content = """{title} - {category_tags} by {author} \n {content} \n\n {url} \n\n #chh #chiphell \n """.format(
|
||||
title=article['title'],
|
||||
category=article['category'],
|
||||
category_tags=category_tags,
|
||||
author=article['author'],
|
||||
content=article['content'],
|
||||
url=article['url'])
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user