From 6d1fffb63db6f86711e55b364cc4bcf5152fdb6f Mon Sep 17 00:00:00 2001
From: Ching L <loooching@gmail.com>
Date: Tue, 9 Dec 2025 10:58:01 +0800
Subject: [PATCH] feat(crawler): add main category support for better
 classification

- Add CATEGORY_MAPPING dictionary to map sub-categories to main categories
- Implement get_main_category function to find parent category
- Include main_category field in article data structure
- Update toot function to display both main and sub categories intelligently
- Avoid duplication when main category is the same as sub category
---
 crawler.py | 37 +++++++++++++++++++++++++++++++++++--
 1 file changed, 35 insertions(+), 2 deletions(-)

diff --git a/crawler.py b/crawler.py
index 894c8fd..d955736 100644
--- a/crawler.py
+++ b/crawler.py
@@ -27,6 +27,30 @@ mastodon_client = Mastodon(
   api_base_url = 'https://nofan.xyz/'
 )
 
+# Main category -> its sub-categories. NOTE(review): '周边附件' is listed under both '掌设' and '摄影'.
+CATEGORY_MAPPING = {
+    '照片': ['人物肖像', '风景游记', '微距静物', '人文扫街', '动物植物', '其它作品'],
+    '电脑': ['配件开箱', '整机搭建', '升级改造', '桌面书房', 'QNAP专区'],
+    '掌设': ['智能手机', '智能穿戴', '笔电平板', '周边附件', 'ThinkPad专区'],
+    '摄影': ['微单卡片', '单反单电', '经典旁轴', '怀旧菲林', '影音摄像', '周边附件'],
+    '汽车': ['买菜车', '商务车', '性能车', '旅行车', 'SUV', 'MPV', '摩托轻骑', '改装配件', 'CHH Auto Club'],
+    '单车': ['山地车', '公路车', '折叠车', '休旅车'],
+    '模型': ['人偶手办', '比例成品', '拼装自组', 'RC遥控'],
+    '败家': ['收藏爱好', '品质生活', '数码前沿', '小资情调', '女王最大', '吃喝玩乐', '育儿分享'],
+    '时尚': ['鞋类', '服饰', '箱包'],
+    '腕表': ['机械表', '电子表'],
+    '视听': ['耳机耳放', '音箱功放', '解码转盘', '随身设备', '唱片录音'],
+    '美食': ['当地美食', '世界美食', '私房菜品', '美食器材']
+}
+
+def get_main_category(sub_category):
+    """Return the main category whose sub-category list contains sub_category, or None."""
+    # None covers both unknown names and names that are themselves main categories.
+    owners = (
+        parent for parent, children in CATEGORY_MAPPING.items() if sub_category in children
+    )
+    return next(owners, None)
+
 
 def save_to_redis(article):
   key = 'chh-article:%s' % article['article_id']
@@ -62,6 +86,8 @@ def crawler():
     author = entry.get('author', '未知')
     # 分类（第一个 tag）
     category = entry.tags[0]['term'] if 'tags' in entry and entry.tags else '未分类'
+    # 获取主分类
+    main_category = get_main_category(category)
     # 简介内容
     content = entry.summary
     content.removesuffix('...')
@@ -85,6 +111,7 @@ def crawler():
       'author': author,
       'date': date,
       'category': category,
+      'main_category': main_category,
       'content': content,
       'url': url,
       'article_id': article_id
@@ -110,10 +137,16 @@ def toot():
     img = scraper.get(article['img_url'], timeout=10)
     # upload article image to mastodon
     media = mastodon_client.media_post(img.content, 'image/jpeg')
+    # 构建分类标签
+    if article.get('main_category'):
+      category_tags = f"#{article['main_category']} #{article['category']}"
+    else:
+      category_tags = f"#{article['category']}"
+
     # toot the article info
-    toot_content = """{title} - #{category} by {author} \n {content} \n\n {url} \n\n #chh #chiphell \n """.format(
+    toot_content = """{title} - {category_tags} by {author} \n {content} \n\n {url} \n\n #chh #chiphell \n """.format(
       title=article['title'],
-      category=article['category'],
+      category_tags=category_tags,
       author=article['author'],
       content=article['content'],
       url=article['url'])