# chiphell.com front-page crawler -> Redis cache -> Mastodon poster
### a crawler for the website: https://www.chiphell.com/
|
||
|
||
|
||
import requests
|
||
from bs4 import BeautifulSoup
|
||
import re
|
||
import redis
|
||
import json
|
||
import cloudscraper
|
||
from loguru import logger
|
||
|
||
from mastodon import Mastodon
|
||
|
||
|
||
# Persist crawler logs to disk; loguru keeps its default stderr sink as well.
logger.add('/root/develop/log/chh-craler.log', level='INFO')


# Local Redis connection. NOTE(review): despite the original "with password"
# note, no password is passed here; db 0 holds both the cached article JSON
# and the sent-article-id set.
redis_db = redis.StrictRedis(host="localhost",
                             port=6379, db=0)

# Mastodon API client used by toot().
# SECURITY NOTE(review): the access token is committed in source control;
# it should be rotated and loaded from an environment variable instead.
mastodon_client = Mastodon(
    access_token = '8LIqGXmerhP8QABT3ppe-1HDATfbmJ-8iDY1_QxNkjk',
    api_base_url = 'https://nofan.xyz/'
)

# Cloudflare-aware HTTP session — presumably chiphell.com sits behind
# Cloudflare, which a plain requests session cannot pass; TODO confirm.
scraper = cloudscraper.create_scraper()
def save_to_redis(article):
    """Cache *article* in Redis if its id has not been seen before.

    The article's URL is shortened (external API call) only for articles
    that are actually stored, and the cached entry expires after 7 days.

    Returns True when a new article was stored, False when the article id
    was already cached (previously this path fell through with an implicit
    None — same truthiness, now explicit).
    """
    key = 'chh-article:%s' % article['article_id']
    if not redis_db.get(key):
        article['url'] = url_shorten(article['url'])
        # ex = 7 days: old articles age out of the cache automatically
        redis_db.set(key, json.dumps(article), ex=3600*24*7)
        return True
    return False
def url_shorten(url):
    """Shorten *url* via the s.tunpok.com link API (best effort).

    On a non-201 response — or any network failure / timeout — the original
    URL is returned unchanged, so the caller never loses the link.
    """
    api_url = "https://s.tunpok.com/api/v2/links"
    # SECURITY NOTE(review): API key committed in source; move to env/config.
    api_key = "HJAs1~1sjd5mX66_Ydg8~9oSuqgkd75VeLhjjni6"
    headers = {
        'x-api-key': api_key,
    }
    try:
        # timeout added: without it a stalled shortener hangs the whole crawl
        resp = requests.post(api_url, headers=headers,
                             json={"target": url}, timeout=10)
    except requests.RequestException:
        # network error on a cosmetic feature: fall back to the long URL
        return url
    if resp.status_code == 201:
        return resp.json()['link']
    else:
        return url
# Pattern for the numeric id in article URLs, e.g. "article-30010-1.html".
# The dot is now escaped — the original unescaped '.' matched any character.
_ARTICLE_ID_RE = re.compile(r'article-(\d+)-1\.html')


def _parse_article(li, home_url):
    """Extract one article dict from a front-page ``<li>``.

    Each list item holds: a thumbnail <img>, the title link ``a.tm03 cl``,
    an author link whose class attribute is empty, a date <span>, a category
    link ``a.asort cl`` and a teaser <div class="tm04 cl">.

    Raises TypeError/AttributeError/KeyError/IndexError when any expected
    field is missing; the caller treats that as "skip this entry".
    """
    img_url = li.find('img')['src']
    title = li.find('a', class_='tm03 cl').text
    # the author link is the <a> with an empty class attribute
    author = li.find('a', class_='').text
    date = li.find('span', style='padding-left: 0px;').text
    category = li.find('a', class_='asort cl').text
    content = li.find('div', class_='tm04 cl').text
    url = home_url + li.find('a', class_='tm03 cl')['href']
    # "article-30010-1.html" -> "30010"; IndexError when the URL is unexpected
    article_id = _ARTICLE_ID_RE.findall(url)[0]
    return {
        'img_url': img_url,
        'title': title,
        'author': author,
        'date': date,
        'category': category,
        'content': content,
        'url': url,
        'article_id': article_id,
    }


def crawler():
    """Scrape the chiphell.com front page and cache new articles in Redis.

    Articles live in the last <div class="chiphell_box cl"> box, inside its
    <div class="acon cl">, one <li> per article under <ul id="threadulid">.
    Newly cached articles are logged; already-seen ones are silently skipped
    by save_to_redis().
    """
    home_url = 'https://www.chiphell.com/'
    # cloudscraper solves the Cloudflare challenge a plain requests.get cannot
    r = scraper.get(home_url)
    soup = BeautifulSoup(r.text, features="html.parser")
    div = soup.find_all('div', class_='chiphell_box cl')[-1]
    div = div.find('div', class_='acon cl')
    ul = div.find('ul', id='threadulid')
    li_list = ul.find_all('li')
    for li in li_list:
        try:
            article = _parse_article(li, home_url)
        except (TypeError, AttributeError, KeyError, IndexError):
            # one malformed entry must not abort the whole crawl
            # (previously any missing tag crashed crawler() entirely)
            logger.warning('skipping malformed article list item')
            continue
        # save_to_redis returns True only for articles not seen before
        if save_to_redis(article):
            logger.info(article)
def toot():
    """Post at most one cached, not-yet-announced article to Mastodon.

    Iterates the cached articles in Redis, skips ids already recorded in the
    'send-chh-article-id' set, and for the first unsent article: downloads
    its image, uploads it as Mastodon media, posts a status, records the id
    as sent, then stops.
    """
    # NOTE(review): KEYS scans the whole keyspace — fine at this scale,
    # but SCAN would be preferred on a large database.
    keys = redis_db.keys('chh-article:*')
    for key in keys:
        article = json.loads(redis_db.get(key))
        # Skip articles already announced (membership in the sent-id set).
        if redis_db.sismember('send-chh-article-id', article['article_id']):
            continue

        # Download the article image (cloudscraper handles Cloudflare).
        img = scraper.get(article['img_url'], timeout=10)
        # Upload the image bytes to Mastodon.
        # Assumes the thumbnail is JPEG — TODO confirm; chiphell may serve PNG.
        media = mastodon_client.media_post(img.content, 'image/jpeg')
        logger.info('Toot %s' % article['title'])
        # Compose the status text: title, category hashtag, author, teaser,
        # shortened URL and site hashtags.
        toot_content = """{title} - #{category} by {author} \n {content} \n\n {url} \n\n #chh #chiphell \n """.format(
            title=article['title'],
            category=article['category'],
            author=article['author'],
            content=article['content'],
            url=article['url'])

        mastodon_client.status_post(toot_content, media_ids=[media['id']])

        # Record the id so this article is never posted again.
        redis_db.sadd('send-chh-article-id', article['article_id'])
        # Deliberate: only one toot per invocation (simple rate limiting).
        break
if __name__ == '__main__':
    crawler()
    toot()
    # Healthcheck ping — only reached when crawl + toot completed without
    # raising, so the monitor sees failures as missed pings.
    # timeout added: without it the script could hang on the monitoring host.
    requests.get('https://up.tunpok.com/api/push/yoLZhc0rZR?status=up&msg=OK&ping=',
                 timeout=10)