# chh-crawler/crawler.py

### a crawler for the website: https://www.chiphell.com/


import json
import os
import re
from urllib.parse import urljoin

import redis
import requests
from bs4 import BeautifulSoup
from mastodon import Mastodon


# Connection settings are read from environment variables so credentials do
# not have to be edited into the source. Every setting falls back to the value
# this script previously hard-coded, so existing setups keep working unchanged.
# NOTE(review): the fallback Redis password below is committed in plain text;
# rotate it and remove the default once CHH_REDIS_PASSWORD is set everywhere.
redis_db = redis.StrictRedis(
  host=os.environ.get('CHH_REDIS_HOST', 'localhost'),
  port=int(os.environ.get('CHH_REDIS_PORT', '6379')),
  db=int(os.environ.get('CHH_REDIS_DB', '0')),
  password=os.environ.get('CHH_REDIS_PASSWORD', 's7LkRNuaLxST5e'),
)

# The Mastodon token and instance URL default to empty strings, as before;
# supply real values through the environment before calling toot().
mastodon_client = Mastodon(
  access_token=os.environ.get('CHH_MASTODON_ACCESS_TOKEN', ''),
  api_base_url=os.environ.get('CHH_MASTODON_API_BASE_URL', ''),
)


def save_to_redis(article):
  """Store *article* in Redis unless an entry for its id already exists.

  The record is serialized as JSON under the key ``chh-article:<article_id>``
  and expires after seven days.

  Args:
    article: dict describing the article; must contain ``'article_id'``.

  Returns:
    True if the article was newly stored, False if the key already existed.
  """
  key = 'chh-article:%s' % article['article_id']
  # nx=True makes "create only if absent" a single atomic Redis command,
  # instead of a separate GET followed by SET that two runs could interleave.
  # redis-py returns True when the key was set and None when it was not.
  stored = redis_db.set(key, json.dumps(article), ex=3600*24*7, nx=True)
  return bool(stored)

# Captures the numeric id in article links such as "article-30010-1.html".
_ARTICLE_ID_RE = re.compile(r'article-(\d+)-1\.html')


def _find_article_items(soup):
  """Return the ``<li>`` elements of the home-page article list.

  The list is the ``<ul id="threadulid">`` inside the ``<div class="acon cl">``
  of the last ``<div class="chiphell_box cl">`` on the page.

  Raises:
    RuntimeError: if any of those containers is missing from the page.
  """
  boxes = soup.find_all('div', class_='chiphell_box cl')
  if not boxes:
    raise RuntimeError('no div with class "chiphell_box cl" found on home page')
  acon = boxes[-1].find('div', class_='acon cl')
  if acon is None:
    raise RuntimeError('no div with class "acon cl" found in the last chiphell_box')
  ul = acon.find('ul', id='threadulid')
  if ul is None:
    raise RuntimeError('no ul with id "threadulid" found in the article box')
  return ul.find_all('li')


def _parse_article(li, home_url):
  """Build an article dict from one ``<li>`` element, or return None.

  Args:
    li: a BeautifulSoup tag for one entry of the article list.
    home_url: site root used to resolve the article's relative link.

  Returns:
    A dict with the keys ``img_url``, ``title``, ``author``, ``date``,
    ``category``, ``content``, ``url`` and ``article_id``, or None when the
    element lacks any expected tag or its link is not an article link.
  """
  # A list item looks like this (text content replaced by placeholders):
  # <li>
  #   <a href="article-30010-1.html" target="_blank" class="tm01 cl">
  #     <img src="https://static.chiphell.com/portal/202307/16/080741btbodlblx8nwzn74.jpg">
  #   </a>
  #   <div class="tmpad cl">
  #     <a href="article-30010-1.html" target="_blank" class="tm03 cl">TITLE</a>
  #     <div class="avart">
  #       <a href="space-username-AUTHOR.html" target="_blank" class="tmava"><img src="./data/avatar/000/27/86/96_avatar_small.jpg"></a>
  #       <div class="avimain cl">
  #         <a href="space-username-AUTHOR.html" target="_blank" class="">AUTHOR</a>
  #       </div>
  #       <div class="avimain2 cl">
  #         <span style="padding-left: 0px;">2023/07/16</span>
  #         <span class="avie">3231</span>
  #         <span class="arep">48</span>
  #         <a href="portal.php?mod=list&amp;catid=128" target="_blank" class="asort cl">CATEGORY</a>
  #       </div>
  #     </div>
  #     <div class="tm04 cl">PREVIEW TEXT ...</div>
  #   </div>
  # </li>
  img = li.find('img')  # first <img> is the article picture, not the avatar
  title_link = li.find('a', class_='tm03 cl')
  author = li.find('a', class_='')  # the author link carries an empty class
  date = li.find('span', style='padding-left: 0px;')
  category = li.find('a', class_='asort cl')
  content = li.find('div', class_='tm04 cl')

  tags = (img, title_link, author, date, category, content)
  if any(tag is None for tag in tags):
    return None
  img_url = img.get('src')
  href = title_link.get('href')
  if not img_url or not href:
    return None

  # urljoin gives the same result as concatenation for relative links and
  # also copes with root-relative or absolute hrefs.
  url = urljoin(home_url, href)
  match = _ARTICLE_ID_RE.search(url)
  if match is None:
    return None

  return {
    'img_url': img_url,
    'title': title_link.text,
    'author': author.text,
    'date': date.text,
    'category': category.text,
    'content': content.text,
    'url': url,
    'article_id': match.group(1),
  }


def crawler():
  """Fetch the chiphell home page and store its articles in Redis.

  Each entry of the home-page article list is parsed into a dict and passed
  to save_to_redis(); articles that were not stored before are printed.
  List entries that cannot be parsed as articles are skipped.

  Raises:
    requests.RequestException: if the page cannot be fetched (including a
      timeout or an HTTP error status).
    RuntimeError: if the expected article list is not present in the page.
  """
  home_url = 'https://www.chiphell.com/'
  # a normal chrome user agent
  headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 8_6_7) AppleWebKit/534.36 (KHTML, like Gecko) Chrome/51.0.2863.218 Safari/601'}
  # A timeout keeps the script from hanging forever on a stalled connection;
  # raise_for_status stops an error page from being parsed as the home page.
  r = requests.get(home_url, headers=headers, timeout=30)
  r.raise_for_status()
  soup = BeautifulSoup(r.text, features="html.parser")

  for li in _find_article_items(soup):
    article = _parse_article(li, home_url)
    if article is None:
      # Not an article entry (or the markup changed); do not abort the
      # whole crawl because of one item.
      continue
    # save the article info to redis; report only newly seen articles
    if save_to_redis(article):
      print(article)

def toot():
  """Post every stored article that has not been posted yet to Mastodon.

  Articles are read from the ``chh-article:*`` keys in Redis. For each one
  the article image is downloaded and uploaded as media, then a status with
  the title, category, author, preview text and link is published.

  A ``chh-tooted:<article_id>`` key (seven-day expiry) records that an
  article has been posted, so calling this function repeatedly does not
  publish duplicates. If posting fails the marker is removed and the
  exception is re-raised, so the article is retried on the next call.
  """
  # scan_iter walks the keyspace incrementally instead of blocking Redis
  # the way KEYS does.
  for key in redis_db.scan_iter(match='chh-article:*'):
    raw = redis_db.get(key)
    if raw is None:
      # The key expired between the scan and the read.
      continue
    article = json.loads(raw)

    # Atomically claim the article; a falsy result means it was already posted.
    marker = 'chh-tooted:%s' % article['article_id']
    if not redis_db.set(marker, 1, ex=3600*24*7, nx=True):
      continue

    try:
      # media_post expects a file name or raw data, not a URL, so fetch the
      # image first and pass its bytes together with the MIME type.
      image = requests.get(article['img_url'], timeout=30)
      image.raise_for_status()
      mime_type = image.headers.get('Content-Type', '').split(';')[0].strip()
      media = mastodon_client.media_post(image.content,
                                         mime_type=mime_type or 'image/jpeg')

      # Built from a single-line template so that source indentation does
      # not leak into the posted text.
      toot_content = '{title} - {category} by {author}\n\n{content}\n\n{url}'.format(
        title=article['title'],
        category=article['category'],
        author=article['author'],
        content=article['content'].strip(),
        url=article['url'])

      # Mastodon.toot() only accepts the status text; attaching media
      # requires status_post().
      mastodon_client.status_post(toot_content, media_ids=[media['id']])
    except Exception:
      # Release the claim so the article is retried next time, then let
      # the error propagate.
      redis_db.delete(marker)
      raise


# Script entry point: running this file directly performs one crawl.
if __name__ == "__main__":
  crawler()


# Sample article record with the same keys as the dicts built in crawler().
article = {
  'img_url': 'https://static.chiphell.com/portal/202307/16/080741btbodlblx8nwzn74.jpg',
  'title': '一点小收藏—AP皇家橡树',
  'author': '幼月',
  'date': '2023/07/16',
  'category': '腕表',
  'content': (
    '\n又是我胡汉三，最近不知道怎么捅了腕表品牌的窝了。。三天两头给我塞东西。。所以作业按照混乱发！\r\n'
    '没有文笔，只有碎碎念。\r\n\r\n\r\n'
    'ROYAL OAK 15551OR\r\n\r\n'
    '女王的第一块AP，最早许愿是蓝盘但是到的是白盘，不过白盘也非常美  ...'
  ),
  'url': 'https://www.chiphell.com/article-30010-1.html',
  'article_id': '30010',
}