### a crawler for the website: https://www.chiphell.com/
import json
import os
import re
import time

import redis
import requests
from loguru import logger
from mastodon import Mastodon
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Configure logger - use local path for macOS
log_dir = './logs'
os.makedirs(log_dir, exist_ok=True)
logger.add('./logs/chh-crawler.log', level='INFO')

# connect to redis on localhost (no auth is configured here)
redis_db = redis.StrictRedis(host="localhost", port=6379, db=0)

# NOTE(review): hard-coded credentials (this Mastodon access token and the
# URL-shortener API key in url_shorten) should be moved to environment
# variables or a secrets store.
mastodon_client = Mastodon(
    access_token='8LIqGXmerhP8QABT3ppe-1HDATfbmJ-8iDY1_QxNkjk',
    api_base_url='https://nofan.xyz/'
)


def save_to_redis(article):
    """Persist *article* under 'chh-article:<id>' with a 7-day TTL.

    The article URL is shortened before storing.  Returns True only when the
    article was newly stored, False when it already existed.  (The original
    returned True unconditionally, so the caller treated already-known
    articles as new on every crawl.)
    """
    key = 'chh-article:%s' % article['article_id']
    if redis_db.get(key):
        return False
    article['url'] = url_shorten(article['url'])
    redis_db.set(key, json.dumps(article), ex=3600 * 24 * 7)
    return True


def url_shorten(url):
    """Shorten *url* via the s.tunpok.com API, falling back to the original.

    Returns the shortened link on HTTP 201; on any other status or on a
    network error the original *url* is returned so crawling never fails
    because of the shortener.
    """
    api_url = "https://s.tunpok.com/api/v2/links"
    api_key = "HJAs1~1sjd5mX66_Ydg8~9oSuqgkd75VeLhjjni6"
    headers = {
        'x-api-key': api_key,
    }
    try:
        # timeout added: a hung shortener must not stall the whole crawl
        resp = requests.post(api_url, headers=headers, json={"target": url},
                             timeout=10)
    except requests.RequestException as e:
        logger.warning(f"URL shortener unreachable: {e}")
        return url
    if resp.status_code == 201:
        return resp.json()['link']
    return url


def setup_driver():
    """Configure and initialize a headless Chrome WebDriver."""
    chrome_options = Options()
    chrome_options.add_argument('--headless')  # Run in headless mode
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    # Hide the usual automation fingerprints so the site serves normal pages
    chrome_options.add_argument('--disable-blink-features=AutomationControlled')
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)
    # Set user agent
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    chrome_options.add_argument(f'user-agent={user_agent}')
    chrome_options.add_argument('--window-size=1920,1080')
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    return driver


def _extract_article(li, home_url):
    """Parse one <li> news entry into an article dict.

    Each entry carries an <img>, a 'tm03' title link, an 'avimain' author
    block, an 'avimain2' date/category block and a 'tm04' preview.  Returns
    None when the entry has no article-<id>-1.html link.  Raises
    NoSuchElementException when mandatory parts (image, title link) are
    missing; the caller logs and skips such entries.
    """
    img_url = li.find_element(By.TAG_NAME, 'img').get_attribute('src')

    title_element = li.find_element(By.CLASS_NAME, 'tm03')
    title = title_element.text
    url = title_element.get_attribute('href')
    if not url.startswith('http'):
        url = home_url + url

    # the numeric article id is embedded in the URL
    match = re.search(r'article-(\d+)-1\.html', url)
    if not match:
        return None
    article_id = match.group(1)

    try:
        author = (li.find_element(By.CLASS_NAME, 'avimain')
                  .find_element(By.TAG_NAME, 'a').text)
    except NoSuchElementException:
        author = 'Unknown'

    # Date and category both live inside the 'avimain2' div; look it up once.
    # (The original re-used a variable that was left undefined whenever the
    # date lookup failed, turning the category lookup into a silent NameError.)
    date = 'Unknown'
    category = 'Unknown'
    try:
        avimain2_div = li.find_element(By.CLASS_NAME, 'avimain2')
    except NoSuchElementException:
        avimain2_div = None
    if avimain2_div is not None:
        try:
            date = avimain2_div.find_element(
                By.CSS_SELECTOR, 'span[style*="padding-left"]').text
        except NoSuchElementException:
            pass
        try:
            category = avimain2_div.find_element(By.CLASS_NAME, 'asort').text
        except NoSuchElementException:
            pass

    try:
        content = li.find_element(By.CLASS_NAME, 'tm04').text
    except NoSuchElementException:
        content = 'No preview available'

    return {
        'img_url': img_url,
        'title': title,
        'author': author,
        'date': date,
        'category': category,
        'content': content,
        'url': url,
        'article_id': article_id,
    }


def crawler():
    """Crawl the chiphell.com front page and store new articles in redis.

    Newly stored articles are printed; parse failures on individual entries
    are logged and skipped.  The WebDriver is always closed on exit.
    """
    home_url = 'https://www.chiphell.com/'
    driver = setup_driver()
    try:
        logger.info(f"Starting to crawl {home_url}")
        driver.get(home_url)
        # crude wait for the JS-rendered article list to appear
        time.sleep(3)

        # Debug: log page info
        logger.info(f"Page title: {driver.title}")
        logger.info(f"Current URL: {driver.current_url}")
        page_source = driver.page_source
        logger.info(f"Page source length: {len(page_source)}")
        logger.info(f"Page source preview (first 2000 chars):\n{page_source[:2000]}")

        # The news list lives in the LAST 'chiphell_box' on the page, inside
        # <div class="acon"><ul id="threadulid"><li>...</li>...</ul></div>
        boxes = driver.find_elements(By.CLASS_NAME, 'chiphell_box')
        if not boxes:
            logger.error("No chiphell_box elements found")
            return
        acon_div = boxes[-1].find_element(By.CLASS_NAME, 'acon')
        ul = acon_div.find_element(By.ID, 'threadulid')
        li_list = ul.find_elements(By.TAG_NAME, 'li')

        # get the article img, title, author, date, category, content and url
        for li in li_list:
            try:
                article = _extract_article(li, home_url)
                if article is None:
                    continue  # entry without an article-<id>-1.html link
                # save the article info to redis; only print genuinely new ones
                if save_to_redis(article):
                    print(article)
            except Exception as e:
                logger.error(f"Error processing article: {e}")
                continue
    finally:
        # Close the WebDriver
        driver.quit()
        logger.info("WebDriver closed")


def toot():
    """Post the first not-yet-tooted stored article to Mastodon.

    At most ONE article is posted per invocation (the trailing break) to
    avoid flooding the timeline; sent ids are tracked in the redis set
    'send-chh-article-id'.
    """
    # get all the stored article keys in redis
    keys = redis_db.keys('chh-article:*')
    for key in keys:
        article = json.loads(redis_db.get(key))
        # skip articles that were already tooted
        if redis_db.sismember('send-chh-article-id', article['article_id']):
            continue
        # download the article image (timeout added so a slow CDN cannot hang
        # the run) and upload it to Mastodon
        img = requests.get(article['img_url'], timeout=30)
        media = mastodon_client.media_post(img.content, 'image/jpeg')
        # toot the article info
        toot_content = """{title} - #{category} by {author} \n {content} \n\n {url} \n\n #chh #chiphell \n """.format(
            title=article['title'],
            category=article['category'],
            author=article['author'],
            content=article['content'],
            url=article['url'])
        mastodon_client.status_post(toot_content, media_ids=[media['id']])
        # mark the article as sent
        redis_db.sadd('send-chh-article-id', article['article_id'])
        break


if __name__ == '__main__':
    crawler()
    toot()
    # health-check ping so the uptime monitor knows the run completed
    requests.get('https://up.tunpok.com/api/push/yoLZhc0rZR?status=up&msg=OK&ping=',
                 timeout=30)