# Changelog:
# - Replace requests with cloudscraper for image downloading
# - Update log file path to use home directory logs
# - Add timeout parameter for image requests to prevent hanging
### a crawler for the website: https://www.chiphell.com/
# --- Imports --------------------------------------------------------------
import requests
from bs4 import BeautifulSoup
import re
import redis
import json
import feedparser
import cloudscraper
from loguru import logger

from mastodon import Mastodon


# File sink for persistent logs (loguru's default stderr sink stays active).
logger.add('/home/ching/logs/chh-craler.log', level='INFO')

# cloudscraper session: bypasses Cloudflare challenges on image downloads.
scraper = cloudscraper.create_scraper()

# Local redis instance, default DB, no authentication.
redis_db = redis.StrictRedis(host="localhost",
                             port=6379, db=0)

# Mastodon API client for posting toots.
# NOTE(review): access token is hardcoded — consider loading it from the
# environment instead of committing it to source control.
mastodon_client = Mastodon(
    access_token = '8LIqGXmerhP8QABT3ppe-1HDATfbmJ-8iDY1_QxNkjk',
    api_base_url = 'https://nofan.xyz/'
)
def save_to_redis(article):
    """Persist a crawled article in redis if it has not been seen before.

    The article's URL is shortened before storage, and entries expire after
    seven days so the keyspace does not grow without bound.

    Args:
        article: dict with at least 'article_id' and 'url' keys; its 'url'
            value is replaced in place with the shortened link when stored.

    Returns:
        True if the article was new and stored, False if it already existed.
        (The original returned an implicit None for existing articles; an
        explicit False is backward-compatible — both are falsy.)
    """
    key = 'chh-article:%s' % article['article_id']
    if redis_db.get(key):
        # Already crawled — leave the stored entry untouched.
        return False
    article['url'] = url_shorten(article['url'])
    redis_db.set(key, json.dumps(article), ex=3600*24*7)
    return True
def url_shorten(url):
    """Shorten *url* via the s.tunpok.com link-shortener API.

    Falls back to returning the original URL on any network failure or
    non-201 response, so callers never lose the link.

    Args:
        url: the long URL to shorten.

    Returns:
        The shortened link on success, otherwise the input URL unchanged.
    """
    api_url = "https://s.tunpok.com/api/v2/links"
    # NOTE(review): API key is hardcoded — consider loading it from the
    # environment instead of committing it to source control.
    api_key = "HJAs1~1sjd5mX66_Ydg8~9oSuqgkd75VeLhjjni6"
    headers = {
        'x-api-key': api_key,
    }
    try:
        # Timeout keeps a slow/dead shortener from hanging the whole run;
        # any network error degrades gracefully to the long URL.
        resp = requests.post(api_url, headers=headers, json={"target": url},
                             timeout=10)
    except requests.RequestException:
        return url
    if resp.status_code == 201:
        return resp.json()['link']
    return url
def crawler():
    """Fetch the chiphell RSS feed and store new articles in redis.

    Each feed entry is parsed into a dict (title, author, date, category,
    summary, image URL, article id) and handed to save_to_redis(); newly
    stored articles are logged.
    """
    rss_url = 'https://www.chiphell.com/portal.php?mod=rss'
    feed = feedparser.parse(rss_url)

    for entry in feed.entries:
        title = entry.title
        url = entry.link
        date = entry.published
        author = entry.get('author', '未知')
        # First tag is treated as the category; fall back when absent.
        category = entry.tags[0]['term'] if 'tags' in entry and entry.tags else '未分类'
        # Summary, minus the trailing ellipsis the feed appends.
        # BUG FIX: str.removesuffix returns a new string; the original code
        # discarded the result, so the '...' was never actually stripped.
        content = entry.summary.removesuffix('...')

        # Image URL: look for a jpeg enclosure among the entry links.
        img_url = ''
        for link in entry.get('links', []):
            if link.get('type', '') == 'image/jpeg' and link.get('rel') == 'enclosure':
                img_url = link.get('href')
                # NOTE(review): stripping the site prefix leaves a relative
                # URL, which toot() then fetches as-is — confirm the feed
                # really embeds this prefix redundantly in enclosure hrefs.
                img_url = img_url.replace('https://www.chiphell.com/', '')
                break

        # article_id comes from the canonical URL pattern article-<id>-1.html.
        article_id_match = re.search(r'article-(\d+)-1\.html', url)
        article_id = article_id_match.group(1) if article_id_match else 'unknown'

        article = {
            'img_url': img_url,
            'title': title,
            'author': author,
            'date': date,
            'category': category,
            'content': content,
            'url': url,
            'article_id': article_id
        }

        # Only log articles that were newly stored (not seen before).
        if save_to_redis(article):
            logger.info(article)
def toot():
    """Post at most one not-yet-tooted stored article to Mastodon.

    Scans the chh-article:* keys in redis, skips articles already recorded
    in the 'send-chh-article-id' set, downloads the article image, uploads
    it as media, posts the toot, then marks the article as sent.
    """
    keys = redis_db.keys('chh-article:*')
    for key in keys:
        article = json.loads(redis_db.get(key))
        # Skip anything already posted (tracked in a redis set).
        if redis_db.sismember('send-chh-article-id', article['article_id']):
            continue

        # Download the article image (cloudscraper bypasses Cloudflare);
        # the timeout keeps a dead image host from hanging the whole run.
        img = scraper.get(article['img_url'], timeout=10)
        if img.status_code != 200:
            # Don't upload an error page as the "image" — skip this article
            # and retry on a later run (it stays out of the sent set).
            logger.warning('Image download failed (%s): %s'
                           % (img.status_code, article['img_url']))
            continue

        # Upload the image, then post the toot referencing it.
        media = mastodon_client.media_post(img.content, 'image/jpeg')
        toot_content = """{title} - #{category} by {author} \n {content} \n\n {url} \n\n #chh #chiphell \n """.format(
            title=article['title'],
            category=article['category'],
            author=article['author'],
            content=article['content'],
            url=article['url'])

        mastodon_client.status_post(toot_content, media_ids=[media['id']])
        logger.info('Toot %s' % article['title'])

        # Record the article so it is never posted twice.
        redis_db.sadd('send-chh-article-id', article['article_id'])
        # Presumably deliberate: only one toot per invocation (rate
        # limiting) — confirm before removing this break.
        break
if __name__ == '__main__':
    crawler()
    toot()
    # Heartbeat ping to the uptime monitor — only reached when crawler()
    # and toot() completed without raising. Timeout added so a dead
    # monitor endpoint cannot hang the cron job.
    requests.get('https://up.tunpok.com/api/push/yoLZhc0rZR?status=up&msg=OK&ping=',
                 timeout=10)