Compare commits

...

4 Commits

Author SHA1 Message Date
Ching L
6d1fffb63d feat(crawler): add main category support for better classification
- Add CATEGORY_MAPPING dictionary to map sub-categories to main categories
- Implement get_main_category function to find parent category
- Include main_category field in article data structure
- Update toot function to display both main and sub categories intelligently
- Avoid duplication when main category is the same as sub category
2025-12-09 10:58:01 +08:00
Ching L
3bbe483c64 feat(crawler): add cloudscraper to bypass Cloudflare protection
- Replace requests with cloudscraper for image downloading
- Update log file path to use home directory logs
- Add timeout parameter for image requests to prevent hanging
2025-12-05 21:07:40 +08:00
Ching L
da1969b103 fix(crawler): replace print statements with logger for better logging
Updated the crawler to use the logger for outputting article information and toot notifications, enhancing the logging mechanism for improved monitoring and debugging.
2025-04-07 11:33:28 +08:00
Ching L
15addaba24 feat(crawler): update crawler to use RSS feed for article retrieval
Replaced HTML scraping with RSS feed parsing to fetch article details including title, URL, author, date, category, content, and image link. This improves reliability and efficiency in gathering articles from the source.
2025-04-07 11:32:00 +08:00

View File

@@ -6,6 +6,8 @@ from bs4 import BeautifulSoup
import re
import redis
import json
import feedparser
import cloudscraper
from loguru import logger
from mastodon import Mastodon
@@ -13,8 +15,8 @@ from mastodon import Mastodon
# logging.basicConfig(filename='/root/develop/log/chh-craler.log', level=logging.INFO)
# logger = logging.getLogger('/root/develop/log/chh-craler.log')
logger.add('/root/develop/log/chh-craler.log', level='INFO')
logger.add('/home/ching/logs/chh-craler.log', level='INFO')
scraper = cloudscraper.create_scraper()
# connect to redis with password
redis_db = redis.StrictRedis(host="localhost",
@@ -25,6 +27,30 @@ mastodon_client = Mastodon(
api_base_url = 'https://nofan.xyz/'
)
# Mapping from each main category to the sub-categories it contains.
CATEGORY_MAPPING = {
    '照片': ['人物肖像', '风景游记', '微距静物', '人文扫街', '动物植物', '其它作品'],
    '电脑': ['配件开箱', '整机搭建', '升级改造', '桌面书房', 'QNAP专区'],
    '掌设': ['智能手机', '智能穿戴', '笔电平板', '周边附件', 'ThinkPad专区'],
    '摄影': ['微单卡片', '单反单电', '经典旁轴', '怀旧菲林', '影音摄像', '周边附件'],
    '汽车': ['买菜车', '商务车', '性能车', '旅行车', 'SUV', 'MPV', '摩托轻骑', '改装配件', 'CHH Auto Club'],
    '单车': ['山地车', '公路车', '折叠车', '休旅车'],
    '模型': ['人偶手办', '比例成品', '拼装自组', 'RC遥控'],
    '败家': ['收藏爱好', '品质生活', '数码前沿', '小资情调', '女王最大', '吃喝玩乐', '育儿分享'],
    '时尚': ['鞋类', '服饰', '箱包'],
    '腕表': ['机械表', '电子表'],
    '视听': ['耳机耳放', '音箱功放', '解码转盘', '随身设备', '唱片录音'],
    '美食': ['当地美食', '世界美食', '私房菜品', '美食器材'],
}


def get_main_category(sub_category):
    """Return the main category that *sub_category* belongs to.

    Scans CATEGORY_MAPPING in insertion order and returns the first main
    category whose sub-category list contains *sub_category*.  Returns
    None when no match is found — i.e. the value is unknown, or is a
    main-category name itself rather than a sub-category.
    """
    return next(
        (main for main, subs in CATEGORY_MAPPING.items() if sub_category in subs),
        None,
    )
def save_to_redis(article):
key = 'chh-article:%s' % article['article_id']
@@ -46,81 +72,46 @@ def url_shorten(url):
return url
def crawler():
# get article list in html div class name = "acon cl"
home_url = 'https://www.chiphell.com/'
# a normal chrome user agent
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 8_6_7) AppleWebKit/534.36 (KHTML, like Gecko) Chrome/51.0.2863.218 Safari/601'}
# get the html page
r = requests.get(home_url, headers=headers)
# use BeautifulSoup to parse the html page
soup = BeautifulSoup(r.text, features="html.parser")
# find the div class name = "acon cl" in the last div name = "chiphell_box cl"
div = soup.find_all('div', class_='chiphell_box cl')[-1]
div = div.find('div', class_='acon cl')
# articles are in the ul div name = "threadulid"
ul = div.find('ul', id='threadulid')
# find all the li tags
li_list = ul.find_all('li')
# a list item is like:
# <li>
# <a href="article-30010-1.html" target="_blank" class="tm01 cl">
# <img src="https://static.chiphell.com/portal/202307/16/080741btbodlblx8nwzn74.jpg">
# </a>
# <div class="tmpad cl">
# <a href="article-30010-1.html" target="_blank" class="tm03 cl">一点小收藏—AP皇家橡树</a>
# <div class="avart">
# <a href="space-username-幼月.html" target="_blank" class="tmava"><img src="./data/avatar/000/27/86/96_avatar_small.jpg"></a>
# <div class="avimain cl">
# <a href="space-username-幼月.html" target="_blank" class="">幼月</a>
# </div>
# <div class="avimain2 cl">
# <span style="padding-left: 0px;">2023/07/16</span>
# <span class="avie">3231</span>
# <span class="arep">48</span>
# <a href="portal.php?mod=list&amp;catid=128" target="_blank" class="asort cl">腕表</a>
# </div>
# </div>
# <div class="tm04 cl">
# 又是我胡汉三,最近不知道怎么捅了腕表品牌的窝了。。三天两头给我塞东西。。所以作业按照混乱发!
# 没有文笔,只有碎碎念。
# ROYAL OAK 15551OR
# 女王的第一块AP最早许愿是蓝盘但是到的是白盘不过白盘也非常美 ...</div>
# </div>
# </li>
# get the article img, title, author, date, category, content and url
for li in li_list:
# get the article img
img = li.find('img')
img_url = img['src']
# get the article title
title = li.find('a', class_='tm03 cl')
title = title.text
# get the article author
author = li.find('a', class_='')
author = author.text
# get the article date
date = li.find('span', style='padding-left: 0px;')
date = date.text
# get the article category
category = li.find('a', class_='asort cl')
category = category.text
# get the article content
content = li.find('div', class_='tm04 cl')
content = content.text
# get the article url
url = li.find('a', class_='tm03 cl')
url = home_url + url['href']
# get the article id
article_id = re.findall(r'article-(\d+)-1.html', url)[0]
rss_url = 'https://www.chiphell.com/portal.php?mod=rss'
feed = feedparser.parse(rss_url)
for entry in feed.entries:
# 标题
title = entry.title
# 链接
url = entry.link
# 发布时间
date = entry.published
# 作者
author = entry.get('author', '未知')
# 分类(第一个 tag)
category = entry.tags[0]['term'] if 'tags' in entry and entry.tags else '未分类'
# 获取主分类
main_category = get_main_category(category)
# 简介内容
content = entry.summary
content.removesuffix('...')
# make the article info a dict
# 图片链接(从 links 中的 enclosure 找 image/jpeg)
img_url = ''
for link in entry.get('links', []):
if link.get('type', '') == 'image/jpeg' and link.get('rel') == 'enclosure':
img_url = link.get('href')
img_url = img_url.replace('https://www.chiphell.com/', '')
break
# 提取 article_id从链接中用正则
article_id_match = re.search(r'article-(\d+)-1\.html', url)
article_id = article_id_match.group(1) if article_id_match else 'unknown'
# 封装成字典
article = {
'img_url': img_url,
'title': title,
'author': author,
'date': date,
'category': category,
'main_category': main_category,
'content': content,
'url': url,
'article_id': article_id
@@ -128,7 +119,7 @@ def crawler():
# save the article info to redis
if save_to_redis(article):
print(article)
logger.info(article)
def toot():
# get all the keys in redis
@@ -142,18 +133,26 @@ def toot():
continue
# download article image to a temp file
img = requests.get(article['img_url'])
#img = requests.get(article['img_url'])
img = scraper.get(article['img_url'], timeout=10)
# upload article image to mastodon
media = mastodon_client.media_post(img.content, 'image/jpeg')
# 构建分类标签
if article.get('main_category'):
category_tags = f"#{article['main_category']} #{article['category']}"
else:
category_tags = f"#{article['category']}"
# toot the article info
toot_content = """{title} - #{category} by {author} \n {content} \n\n {url} \n\n #chh #chiphell \n """.format(
toot_content = """{title} - {category_tags} by {author} \n {content} \n\n {url} \n\n #chh #chiphell \n """.format(
title=article['title'],
category=article['category'],
category_tags=category_tags,
author=article['author'],
content=article['content'],
url=article['url'])
mastodon_client.status_post(toot_content, media_ids=[media['id']])
logger.info('Toot %s' % article['title'])
# add the article['id'] to the set
redis_db.sadd('send-chh-article-id', article['article_id'])