chh-crawler/crawler.py
Ching L 6d1fffb63d feat(crawler): add main category support for better classification
- Add CATEGORY_MAPPING dictionary to map sub-categories to main categories
- Implement get_main_category function to find parent category
- Include main_category field in article data structure
- Update toot function to display both main and sub categories intelligently
- Avoid duplication when main category is the same as sub category
2025-12-09 10:58:01 +08:00

165 lines
5.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

### a crawler for the website: https://www.chiphell.com/
import requests
from bs4 import BeautifulSoup
import re
import redis
import json
import feedparser
import cloudscraper
from loguru import logger
from mastodon import Mastodon
# Module-level setup: log sink, Cloudflare-capable HTTP client, Redis
# connection, and Mastodon API client. All of this runs at import time.
logger.add('/home/ching/logs/chh-craler.log', level='INFO')
# cloudscraper transparently solves Cloudflare anti-bot challenges; used
# below in toot() to download article images.
scraper = cloudscraper.create_scraper()
# Local Redis, database 0. (No password is actually passed, despite the
# original note saying "connect to redis with password".)
redis_db = redis.StrictRedis(host="localhost",
port=6379, db=0)
# NOTE(review): the access token is committed in source — it should be moved
# to an environment variable / config file and the leaked token revoked.
mastodon_client = Mastodon(
access_token = '8LIqGXmerhP8QABT3ppe-1HDATfbmJ-8iDY1_QxNkjk',
api_base_url = 'https://nofan.xyz/'
)
# Maps each main (parent) category to the list of sub-categories it contains.
# get_main_category() scans this table to resolve a feed tag to its parent.
CATEGORY_MAPPING = {
    '照片': ['人物肖像', '风景游记', '微距静物', '人文扫街', '动物植物', '其它作品'],
    '电脑': ['配件开箱', '整机搭建', '升级改造', '桌面书房', 'QNAP专区'],
    '掌设': ['智能手机', '智能穿戴', '笔电平板', '周边附件', 'ThinkPad专区'],
    '摄影': ['微单卡片', '单反单电', '经典旁轴', '怀旧菲林', '影音摄像', '周边附件'],
    '汽车': ['买菜车', '商务车', '性能车', '旅行车', 'SUV', 'MPV', '摩托轻骑', '改装配件', 'CHH Auto Club'],
    '单车': ['山地车', '公路车', '折叠车', '休旅车'],
    '模型': ['人偶手办', '比例成品', '拼装自组', 'RC遥控'],
    '败家': ['收藏爱好', '品质生活', '数码前沿', '小资情调', '女王最大', '吃喝玩乐', '育儿分享'],
    '时尚': ['鞋类', '服饰', '箱包'],
    '腕表': ['机械表', '电子表'],
    '视听': ['耳机耳放', '音箱功放', '解码转盘', '随身设备', '唱片录音'],
    '美食': ['当地美食', '世界美食', '私房菜品', '美食器材'],
}
def get_main_category(sub_category):
    """Return the main category whose sub-category list contains *sub_category*.

    Returns None when *sub_category* is unknown — including when it is
    itself a main-category name rather than a sub-category.
    """
    matches = (
        main for main, subs in CATEGORY_MAPPING.items() if sub_category in subs
    )
    return next(matches, None)
def save_to_redis(article):
    """Persist *article* in Redis under key ``chh-article:<article_id>``.

    The article's URL is shortened before storage, and the entry expires
    after seven days so Redis does not grow without bound.

    Returns True when the article was new and stored, False when a record
    with the same article_id already exists. (The original returned an
    implicit None for duplicates; the caller truth-tests the result, so an
    explicit False is backward-compatible and clearer.)
    """
    key = 'chh-article:%s' % article['article_id']
    if redis_db.get(key):
        # Already crawled within the TTL window — skip.
        return False
    # Shorten only new articles' links, to avoid wasting shortener API calls.
    article['url'] = url_shorten(article['url'])
    redis_db.set(key, json.dumps(article), ex=3600 * 24 * 7)
    return True
def url_shorten(url):
    """Shorten *url* via the s.tunpok.com API, best-effort.

    Returns the short link on success (HTTP 201); on any network error or
    other status code the original *url* is returned unchanged, so crawling
    never fails because of the shortener.
    """
    api_url = "https://s.tunpok.com/api/v2/links"
    # NOTE(review): API key is hard-coded in source — move to an environment
    # variable / config file and rotate the committed key.
    api_key = "HJAs1~1sjd5mX66_Ydg8~9oSuqgkd75VeLhjjni6"
    headers = {
        'x-api-key': api_key,
    }
    try:
        # Timeout added: the original call could hang the crawler forever.
        resp = requests.post(api_url, headers=headers, json={"target": url},
                             timeout=10)
    except requests.RequestException:
        return url
    if resp.status_code == 201:
        return resp.json()['link']
    return url
def crawler():
    """Fetch the Chiphell portal RSS feed and store each new article in Redis.

    For every feed entry this extracts title, link, date, author, category
    (plus its parent main category), summary, enclosure image, and the
    numeric article id, then hands the dict to save_to_redis(). Only newly
    stored articles are logged.
    """
    rss_url = 'https://www.chiphell.com/portal.php?mod=rss'
    feed = feedparser.parse(rss_url)
    for entry in feed.entries:
        title = entry.title
        url = entry.link
        date = entry.published
        author = entry.get('author', '未知')
        # Category = the first tag's term, when tags exist.
        category = entry.tags[0]['term'] if 'tags' in entry and entry.tags else '未分类'
        main_category = get_main_category(category)
        # BUG FIX: str.removesuffix() returns a new string (str is immutable);
        # the original discarded the result, so the trailing '...' was never
        # actually stripped. Assign the return value.
        content = entry.summary.removesuffix('...')
        # Image URL: the enclosure link of type image/jpeg, if present,
        # stored relative to the site root.
        img_url = ''
        for link in entry.get('links', []):
            if link.get('type', '') == 'image/jpeg' and link.get('rel') == 'enclosure':
                img_url = link.get('href')
                img_url = img_url.replace('https://www.chiphell.com/', '')
                break
        # The canonical article URL looks like .../article-<id>-1.html.
        article_id_match = re.search(r'article-(\d+)-1\.html', url)
        article_id = article_id_match.group(1) if article_id_match else 'unknown'
        article = {
            'img_url': img_url,
            'title': title,
            'author': author,
            'date': date,
            'category': category,
            'main_category': main_category,
            'content': content,
            'url': url,
            'article_id': article_id
        }
        # Log only articles that were actually new (save_to_redis dedupes).
        if save_to_redis(article):
            logger.info(article)
def toot():
    """Post one not-yet-sent article from Redis to Mastodon, then stop.

    At most a single article is tooted per invocation (the trailing
    ``break``), which acts as a simple rate limit when run from cron.
    Sent article ids are tracked in the Redis set 'send-chh-article-id'.
    """
    keys = redis_db.keys('chh-article:*')
    for key in keys:
        article = json.loads(redis_db.get(key))
        # Skip anything that has already been tooted.
        if redis_db.sismember('send-chh-article-id', article['article_id']):
            continue
        # Upload the article image when there is one. BUG FIXES: (1) the
        # original crashed on articles with an empty img_url; (2) crawler()
        # stores the image path relative to the site root, but the original
        # fetched that relative path directly — restore the base URL first.
        media_ids = []
        img_url = article.get('img_url', '')
        if img_url:
            if not img_url.startswith('http'):
                img_url = 'https://www.chiphell.com/' + img_url
            img = scraper.get(img_url, timeout=10)
            media = mastodon_client.media_post(img.content, 'image/jpeg')
            media_ids = [media['id']]
        # Show "#main #sub" when a parent category exists, otherwise "#sub".
        if article.get('main_category'):
            category_tags = f"#{article['main_category']} #{article['category']}"
        else:
            category_tags = f"#{article['category']}"
        toot_content = """{title} - {category_tags} by {author} \n {content} \n\n {url} \n\n #chh #chiphell \n """.format(
            title=article['title'],
            category_tags=category_tags,
            author=article['author'],
            content=article['content'],
            url=article['url'])
        mastodon_client.status_post(toot_content, media_ids=media_ids)
        logger.info('Toot %s' % article['title'])
        # Remember the id so the article is never posted twice.
        redis_db.sadd('send-chh-article-id', article['article_id'])
        break
if __name__ == '__main__':
    crawler()
    toot()
    # Health-check ping to the uptime monitor. Timeout added so a slow
    # monitor endpoint cannot hang the cron job; a failed ping is non-fatal
    # (the crawl itself already succeeded at this point).
    try:
        requests.get(
            'https://up.tunpok.com/api/push/yoLZhc0rZR?status=up&msg=OK&ping=',
            timeout=10)
    except requests.RequestException as e:
        logger.warning('Health ping failed: %s' % e)