# chh-crawler/crawler.py

### a crawler for the website: https://www.chiphell.com/


import requests
import re
import redis
import json
from loguru import logger
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
import time

from mastodon import Mastodon


# Configure logger - write to a local ./logs directory (local path chosen
# for macOS, per the original note).
import os
log_dir = './logs'
# exist_ok=True replaces the separate os.path.exists() check, which could
# race with another process creating the directory in between.
os.makedirs(log_dir, exist_ok=True)
# Build the sink path from log_dir so the directory and the file path
# cannot drift apart.
logger.add(os.path.join(log_dir, 'chh-crawler.log'), level='INFO')


# Connect to the local redis server (db 0). A password is only used when the
# REDIS_PASSWORD environment variable is set and non-empty; otherwise the
# connection is unauthenticated, exactly as before.
redis_db = redis.StrictRedis(host="localhost",
                              port=6379, db=0,
                              password=os.environ.get('REDIS_PASSWORD') or None)

# Mastodon client used by toot() to upload media and post statuses.
# The access token is taken from MASTODON_ACCESS_TOKEN when that variable is
# set; the literal below is only a backward-compatible fallback.
# SECURITY NOTE(review): a credential committed to source should be rotated
# and the fallback removed once the environment variable is deployed.
mastodon_client = Mastodon(
  access_token = os.environ.get(
    'MASTODON_ACCESS_TOKEN',
    '8LIqGXmerhP8QABT3ppe-1HDATfbmJ-8iDY1_QxNkjk'),
  api_base_url = 'https://nofan.xyz/'
)


def save_to_redis(article):
  """Store a newly seen article in redis and report whether it was new.

  The article is serialised as JSON under the key
  ``chh-article:<article_id>`` with a 7-day expiry. For a new article,
  ``article['url']`` is replaced in place by the result of ``url_shorten``
  before storing, so the caller's dict carries the shortened link too.

  Args:
    article: dict with at least the keys ``'article_id'`` and ``'url'``;
      it must be JSON-serialisable.

  Returns:
    True if the article was not yet in redis and has been stored; False if
    it was already present (in which case nothing is modified).
  """
  key = 'chh-article:%s' % article['article_id']
  # exists() answers the presence question without transferring the stored
  # JSON payload.
  if redis_db.exists(key):
    return False
  article['url'] = url_shorten(article['url'])
  redis_db.set(key, json.dumps(article), ex=3600*24*7)
  return True

def url_shorten(url):
  """Return a shortened link for ``url``, or ``url`` itself on any failure.

  Sends ``{"target": url}`` to the link-shortening API and returns the
  ``link`` field of the response when the API answers with HTTP 201.
  Shortening is best effort: a non-201 status, a network error, a timeout
  or an unexpected response body all fall back to the original URL so the
  caller can still store the article.

  Args:
    url: the URL to shorten.

  Returns:
    The shortened link, or ``url`` unchanged when shortening failed.
  """
  api_url = "https://s.tunpok.com/api/v2/links"
  # The key can be overridden through URL_SHORTENER_API_KEY; the literal is
  # kept only as a backward-compatible fallback.
  # SECURITY NOTE(review): rotate this key and drop the fallback once the
  # environment variable is deployed.
  api_key = os.environ.get('URL_SHORTENER_API_KEY',
                           "HJAs1~1sjd5mX66_Ydg8~9oSuqgkd75VeLhjjni6")
  headers = {
    'x-api-key': api_key,
  }
  try:
    # Without a timeout a stalled shortener would hang the whole crawl.
    resp = requests.post(api_url, headers=headers, json={"target": url},
                         timeout=10)
    if resp.status_code == 201:
      return resp.json()['link']
    logger.warning(f"URL shortener returned status {resp.status_code} for {url}")
  except (requests.RequestException, ValueError, KeyError) as e:
    # RequestException: connection/timeout problems; ValueError: body is
    # not JSON; KeyError: JSON without a 'link' field.
    logger.warning(f"URL shortening failed for {url}: {e}")
  return url

def setup_driver():
  """Create the headless Chrome WebDriver used by the crawler.

  Returns:
    A ``webdriver.Chrome`` instance started through the chromedriver binary
    that ``ChromeDriverManager().install()`` provides, configured with the
    command-line flags and experimental options listed below.
  """
  # Desktop Chrome user agent sent instead of the driver's default one.
  ua_string = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'

  opts = Options()
  # Command-line switches, added in the same order as before.
  for flag in (
    '--headless',  # no visible browser window
    '--disable-gpu',
    '--no-sandbox',
    '--disable-dev-shm-usage',
    '--disable-blink-features=AutomationControlled',
    f'user-agent={ua_string}',
    '--window-size=1920,1080',
  ):
    opts.add_argument(flag)

  # NOTE(review): together with the AutomationControlled flag above, these
  # presumably make the browser look less like an automated session.
  opts.add_experimental_option("excludeSwitches", ["enable-automation"])
  opts.add_experimental_option('useAutomationExtension', False)

  return webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=opts,
  )

# Extracts the numeric id from article links such as "article-30010-1.html".
_ARTICLE_ID_RE = re.compile(r'article-(\d+)-1\.html')

def _find_text(root, locators, default):
  """Follow ``locators`` from ``root`` and return the last element's text.

  Args:
    root: WebDriver or WebElement to start searching from.
    locators: sequence of ``(by, value)`` pairs; each one is looked up
      inside the element found by the previous pair.
    default: value returned when any lookup finds no element.

  Returns:
    The ``.text`` of the final element, or ``default``.
  """
  element = root
  try:
    for by, value in locators:
      element = element.find_element(by, value)
  except NoSuchElementException:
    return default
  return element.text

def _parse_article(li, home_url):
  """Build the article dict for one ``<li>`` of the front-page list.

  A list item is laid out roughly like this (texts replaced by
  placeholders):

    <li>
      <a href="article-30010-1.html" class="tm01 cl"><img src="..."></a>
      <div class="tmpad cl">
        <a href="article-30010-1.html" class="tm03 cl">(title)</a>
        <div class="avart">
          <a href="space-username-(author).html" class="tmava"><img src="..."></a>
          <div class="avimain cl">
            <a href="space-username-(author).html">(author)</a>
          </div>
          <div class="avimain2 cl">
            <span style="padding-left: 0px;">2023/07/16</span>
            <span class="avie">3231</span>
            <span class="arep">48</span>
            <a href="portal.php?mod=list&amp;catid=128" class="asort cl">(category)</a>
          </div>
        </div>
        <div class="tm04 cl">(content preview)</div>
      </div>
    </li>

  Args:
    li: the WebElement for the list item.
    home_url: site root, prefixed to hrefs that do not start with 'http'.

  Returns:
    A dict with the keys img_url, title, author, date, category, content,
    url and article_id, or None when the item has no link or the link does
    not contain an article id.

  Raises:
    NoSuchElementException: if the item has no ``<img>`` or no ``tm03``
      title link.
  """
  img_url = li.find_element(By.TAG_NAME, 'img').get_attribute('src')

  title_element = li.find_element(By.CLASS_NAME, 'tm03')
  title = title_element.text
  url = title_element.get_attribute('href')
  if not url:
    return None
  if not url.startswith('http'):
    # home_url ends with '/', so drop a leading slash to avoid '//'.
    url = home_url + url.lstrip('/')

  id_match = _ARTICLE_ID_RE.search(url)
  if not id_match:
    return None

  # Each optional field is looked up starting from this <li>, so a missing
  # block can never pick up an element left over from a previous item.
  avimain2 = (By.CLASS_NAME, 'avimain2')
  return {
    'img_url': img_url,
    'title': title,
    'author': _find_text(
      li, [(By.CLASS_NAME, 'avimain'), (By.TAG_NAME, 'a')], 'Unknown'),
    'date': _find_text(
      li, [avimain2, (By.CSS_SELECTOR, 'span[style*="padding-left"]')],
      'Unknown'),
    'category': _find_text(
      li, [avimain2, (By.CLASS_NAME, 'asort')], 'Unknown'),
    'content': _find_text(
      li, [(By.CLASS_NAME, 'tm04')], 'No preview available'),
    'url': url,
    'article_id': id_match.group(1),
  }

def crawler():
  """Crawl the chiphell front page and store newly seen articles in redis.

  Opens the home page in a headless Chrome session, takes the article list
  (``ul#threadulid`` inside the ``acon`` div of the last ``chiphell_box``),
  parses every ``<li>`` into an article dict and passes it to
  ``save_to_redis``. Articles that were not stored before are printed.

  Problems with a single list item are logged and the item is skipped. If
  the article list cannot be located at all, an error is logged and the
  function returns. The WebDriver is always closed.
  """
  home_url = 'https://www.chiphell.com/'
  driver = setup_driver()

  try:
    logger.info(f"Starting to crawl {home_url}")
    driver.get(home_url)

    # Wait for the article list to show up instead of sleeping for a fixed
    # time: returns as soon as the items exist and tolerates slower loads.
    try:
      WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '#threadulid li')))
    except TimeoutException:
      logger.warning("Timed out waiting for the article list to appear")

    # Debug: log page info
    logger.info(f"Page title: {driver.title}")
    logger.info(f"Current URL: {driver.current_url}")
    page_source = driver.page_source
    logger.info(f"Page source length: {len(page_source)}")
    logger.info(f"Page source preview (first 2000 chars):\n{page_source[:2000]}")

    # The article list lives in the last chiphell_box on the page.
    boxes = driver.find_elements(By.CLASS_NAME, 'chiphell_box')
    if not boxes:
      logger.error("No chiphell_box elements found")
      return

    try:
      acon_div = boxes[-1].find_element(By.CLASS_NAME, 'acon')
      ul = acon_div.find_element(By.ID, 'threadulid')
    except NoSuchElementException as e:
      logger.error(f"Article list not found in the last chiphell_box: {e}")
      return

    for li in ul.find_elements(By.TAG_NAME, 'li'):
      try:
        article = _parse_article(li, home_url)
        if article is None:
          continue

        # save the article info to redis; only new articles are printed
        if save_to_redis(article):
          print(article)

      except Exception as e:
        # One malformed item must not abort the whole crawl.
        logger.error(f"Error processing article: {e}")
        continue

  finally:
    # Close the WebDriver
    driver.quit()
    logger.info("WebDriver closed")

def toot():
  """Post at most one not-yet-sent article from redis to Mastodon.

  Iterates over the ``chh-article:*`` keys, skips articles whose
  ``article_id`` is already in the redis set ``send-chh-article-id``, and
  for the first remaining article downloads its image, uploads it as
  media, posts the status and records the id in the set. The function
  stops after the first successful post.

  An article whose image cannot be downloaded is logged and skipped, so a
  single broken image does not block the other articles. Errors from the
  Mastodon API are not caught.
  """
  # get all the article keys in redis
  keys = redis_db.keys('chh-article:*')
  for key in keys:
    raw = redis_db.get(key)
    if raw is None:
      # The key expired between the KEYS call and this GET.
      continue
    article = json.loads(raw)

    # skip articles whose article_id is already in the sent set
    if redis_db.sismember('send-chh-article-id', article['article_id']):
      continue

    # download the article image; a timeout keeps a stalled server from
    # hanging the job, and raise_for_status avoids uploading an error page
    try:
      img = requests.get(article['img_url'], timeout=30)
      img.raise_for_status()
    except requests.RequestException as e:
      logger.error(
        f"Failed to download image for article {article['article_id']}: {e}")
      continue

    # Use the MIME type the server reported when it is an image type;
    # otherwise fall back to the previous fixed value.
    mime_type = img.headers.get('Content-Type', '').split(';')[0].strip()
    if not mime_type.startswith('image/'):
      mime_type = 'image/jpeg'

    # upload article image to mastodon
    media = mastodon_client.media_post(img.content, mime_type)
    # toot the article info
    toot_content = """{title} - #{category} by {author} \n {content} \n\n {url} \n\n #chh #chiphell \n """.format(
      title=article['title'],
      category=article['category'],
      author=article['author'],
      content=article['content'],
      url=article['url'])

    mastodon_client.status_post(toot_content, media_ids=[media['id']])

    # remember the article_id so it is not posted again
    redis_db.sadd('send-chh-article-id', article['article_id'])
    # only one article is posted per run
    break

if __name__ == '__main__':
  # Crawl first so that toot() can post an article found in this run.
  crawler()
  toot()
  # Report a successful run. NOTE(review): the URL looks like an uptime
  # monitor's push endpoint - confirm. The timeout keeps an unresponsive
  # endpoint from hanging the job indefinitely.
  requests.get('https://up.tunpok.com/api/push/yoLZhc0rZR?status=up&msg=OK&ping=',
               timeout=10)