feat(crawler): add main category support for better classification
- Add CATEGORY_MAPPING dictionary to map sub-categories to main categories
- Implement get_main_category function to find the parent category
- Include main_category field in the article data structure
- Update toot function to display both main and sub categories
- Avoid duplication when the main category is the same as the sub-category
This commit is contained in:
parent
3bbe483c64
commit
6d1fffb63d
37
crawler.py
37
crawler.py
@ -27,6 +27,30 @@ mastodon_client = Mastodon(
|
|||||||
api_base_url = 'https://nofan.xyz/'
|
api_base_url = 'https://nofan.xyz/'
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Mapping from each main category to the sub-categories listed under it.
# Treated as a constant: the reverse index below is derived from it once.
#
# NOTE: '周边附件' is listed under both '掌设' and '摄影'. A sub-category name
# alone cannot tell the two apart, so lookups resolve to the main category
# that is declared first ('掌设').
CATEGORY_MAPPING = {
    '照片': ['人物肖像', '风景游记', '微距静物', '人文扫街', '动物植物', '其它作品'],
    '电脑': ['配件开箱', '整机搭建', '升级改造', '桌面书房', 'QNAP专区'],
    '掌设': ['智能手机', '智能穿戴', '笔电平板', '周边附件', 'ThinkPad专区'],
    '摄影': ['微单卡片', '单反单电', '经典旁轴', '怀旧菲林', '影音摄像', '周边附件'],
    '汽车': ['买菜车', '商务车', '性能车', '旅行车', 'SUV', 'MPV', '摩托轻骑', '改装配件', 'CHH Auto Club'],
    '单车': ['山地车', '公路车', '折叠车', '休旅车'],
    '模型': ['人偶手办', '比例成品', '拼装自组', 'RC遥控'],
    '败家': ['收藏爱好', '品质生活', '数码前沿', '小资情调', '女王最大', '吃喝玩乐', '育儿分享'],
    '时尚': ['鞋类', '服饰', '箱包'],
    '腕表': ['机械表', '电子表'],
    '视听': ['耳机耳放', '音箱功放', '解码转盘', '随身设备', '唱片录音'],
    '美食': ['当地美食', '世界美食', '私房菜品', '美食器材'],
}


def _build_sub_to_main_index(mapping):
    """Invert a main -> [sub, ...] mapping into a sub -> main lookup table."""
    index = {}
    for main_category, sub_categories in mapping.items():
        for sub_category in sub_categories:
            # setdefault keeps the first main category when a sub-category
            # is listed under more than one, matching declaration order.
            index.setdefault(sub_category, main_category)
    return index


# Built once at import time so each lookup is a single dict access instead of
# a scan over every sub-category list.
_SUB_TO_MAIN_CATEGORY = _build_sub_to_main_index(CATEGORY_MAPPING)


def get_main_category(sub_category):
    """Return the main category that lists ``sub_category``.

    Args:
        sub_category: Category name to look up.

    Returns:
        The name of the main category whose sub-category list contains
        ``sub_category``, or ``None`` when the value is not a known
        sub-category (this includes main category names themselves).
    """
    try:
        return _SUB_TO_MAIN_CATEGORY.get(sub_category)
    except TypeError:
        # Unhashable values (e.g. a list) cannot be category names; report
        # "no match" rather than raising.
        return None
|
||||||
|
|
||||||
|
|
||||||
def save_to_redis(article):
|
def save_to_redis(article):
|
||||||
key = 'chh-article:%s' % article['article_id']
|
key = 'chh-article:%s' % article['article_id']
|
||||||
@ -62,6 +86,8 @@ def crawler():
|
|||||||
author = entry.get('author', '未知')
|
author = entry.get('author', '未知')
|
||||||
# 分类(第一个 tag)
|
# 分类(第一个 tag)
|
||||||
category = entry.tags[0]['term'] if 'tags' in entry and entry.tags else '未分类'
|
category = entry.tags[0]['term'] if 'tags' in entry and entry.tags else '未分类'
|
||||||
|
# 获取主分类
|
||||||
|
main_category = get_main_category(category)
|
||||||
# 简介内容
|
# 简介内容
|
||||||
content = entry.summary
|
content = entry.summary
|
||||||
content.removesuffix('...')
|
content.removesuffix('...')
|
||||||
@ -85,6 +111,7 @@ def crawler():
|
|||||||
'author': author,
|
'author': author,
|
||||||
'date': date,
|
'date': date,
|
||||||
'category': category,
|
'category': category,
|
||||||
|
'main_category': main_category,
|
||||||
'content': content,
|
'content': content,
|
||||||
'url': url,
|
'url': url,
|
||||||
'article_id': article_id
|
'article_id': article_id
|
||||||
@ -110,10 +137,16 @@ def toot():
|
|||||||
img = scraper.get(article['img_url'], timeout=10)
|
img = scraper.get(article['img_url'], timeout=10)
|
||||||
# upload article image to mastodon
|
# upload article image to mastodon
|
||||||
media = mastodon_client.media_post(img.content, 'image/jpeg')
|
media = mastodon_client.media_post(img.content, 'image/jpeg')
|
||||||
|
# 构建分类标签
|
||||||
|
if article.get('main_category'):
|
||||||
|
category_tags = f"#{article['main_category']} #{article['category']}"
|
||||||
|
else:
|
||||||
|
category_tags = f"#{article['category']}"
|
||||||
|
|
||||||
# toot the article info
|
# toot the article info
|
||||||
toot_content = """{title} - #{category} by {author} \n {content} \n\n {url} \n\n #chh #chiphell \n """.format(
|
toot_content = """{title} - {category_tags} by {author} \n {content} \n\n {url} \n\n #chh #chiphell \n """.format(
|
||||||
title=article['title'],
|
title=article['title'],
|
||||||
category=article['category'],
|
category_tags=category_tags,
|
||||||
author=article['author'],
|
author=article['author'],
|
||||||
content=article['content'],
|
content=article['content'],
|
||||||
url=article['url'])
|
url=article['url'])
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user