From 6d1fffb63db6f86711e55b364cc4bcf5152fdb6f Mon Sep 17 00:00:00 2001
From: Ching L
Date: Tue, 9 Dec 2025 10:58:01 +0800
Subject: [PATCH] feat(crawler): add main category support for better classification

- Add CATEGORY_MAPPING dictionary to map sub-categories to main categories
- Implement get_main_category function to find parent category
- Include main_category field in article data structure
- Update toot function to display both main and sub categories intelligently
- Avoid duplication when main category is the same as sub category
---
 crawler.py | 37 +++++++++++++++++++++++++++++++++++--
 1 file changed, 35 insertions(+), 2 deletions(-)

diff --git a/crawler.py b/crawler.py
index 894c8fd..d955736 100644
--- a/crawler.py
+++ b/crawler.py
@@ -27,6 +27,30 @@ mastodon_client = Mastodon(
     api_base_url = 'https://nofan.xyz/'
 )
 
+# 主分类和下级分类的映射关系
+CATEGORY_MAPPING = {
+    '照片': ['人物肖像', '风景游记', '微距静物', '人文扫街', '动物植物', '其它作品'],
+    '电脑': ['配件开箱', '整机搭建', '升级改造', '桌面书房', 'QNAP专区'],
+    '掌设': ['智能手机', '智能穿戴', '笔电平板', '周边附件', 'ThinkPad专区'],
+    '摄影': ['微单卡片', '单反单电', '经典旁轴', '怀旧菲林', '影音摄像', '周边附件'],
+    '汽车': ['买菜车', '商务车', '性能车', '旅行车', 'SUV', 'MPV', '摩托轻骑', '改装配件', 'CHH Auto Club'],
+    '单车': ['山地车', '公路车', '折叠车', '休旅车'],
+    '模型': ['人偶手办', '比例成品', '拼装自组', 'RC遥控'],
+    '败家': ['收藏爱好', '品质生活', '数码前沿', '小资情调', '女王最大', '吃喝玩乐', '育儿分享'],
+    '时尚': ['鞋类', '服饰', '箱包'],
+    '腕表': ['机械表', '电子表'],
+    '视听': ['耳机耳放', '音箱功放', '解码转盘', '随身设备', '唱片录音'],
+    '美食': ['当地美食', '世界美食', '私房菜品', '美食器材']
+}
+
+def get_main_category(sub_category):
+    """根据下级分类获取主分类"""
+    for main_category, sub_categories in CATEGORY_MAPPING.items():
+        if sub_category in sub_categories:
+            return main_category
+    # 如果不是下级分类或者是主分类本身,返回 None
+    return None
+
 
 def save_to_redis(article):
     key = 'chh-article:%s' % article['article_id']
@@ -62,6 +86,8 @@ def crawler():
         author = entry.get('author', '未知')
         # 分类(第一个 tag)
         category = entry.tags[0]['term'] if 'tags' in entry and entry.tags else '未分类'
+        # 获取主分类
+        main_category = get_main_category(category)
         # 简介内容
         content = entry.summary
         content.removesuffix('...')
@@ -85,6 +111,7 @@ def crawler():
             'author': author,
             'date': date,
             'category': category,
+            'main_category': main_category,
             'content': content,
             'url': url,
             'article_id': article_id
@@ -110,10 +137,16 @@ def toot():
         img = scraper.get(article['img_url'], timeout=10)
         # upload article image to mastodon
         media = mastodon_client.media_post(img.content, 'image/jpeg')
+        # 构建分类标签
+        if article.get('main_category'):
+            category_tags = f"#{article['main_category']} #{article['category']}"
+        else:
+            category_tags = f"#{article['category']}"
+
         # toot the article info
-        toot_content = """{title} - #{category} by {author} \n {content} \n\n {url} \n\n #chh #chiphell \n """.format(
+        toot_content = """{title} - {category_tags} by {author} \n {content} \n\n {url} \n\n #chh #chiphell \n """.format(
             title=article['title'],
-            category=article['category'],
+            category_tags=category_tags,
             author=article['author'],
             content=article['content'],
             url=article['url'])