### A crawler for the website: https://www.chiphell.com/
import json
import os
import time

import redis
import requests
from loguru import logger
from mastodon import Mastodon
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Configure logger - use a local path so the script also works on macOS.
log_dir = './logs'
os.makedirs(log_dir, exist_ok=True)  # exist_ok avoids a check-then-create race
logger.add('./logs/chh-crawler.log', level='INFO')

# Connect to redis (no password configured; db 0 on the default port).
redis_db = redis.StrictRedis(host="localhost", port=6379, db=0)

# NOTE(security): credentials should not live in source control.  Read them
# from the environment; the literals remain only as a backward-compatible
# fallback and should be rotated.
mastodon_client = Mastodon(
    access_token=os.environ.get(
        'MASTODON_ACCESS_TOKEN',
        '8LIqGXmerhP8QABT3ppe-1HDATfbmJ-8iDY1_QxNkjk',
    ),
    api_base_url='https://nofan.xyz/',
)


def save_to_redis(article):
    """Persist *article* in redis under a per-article key, once.

    The article's URL is shortened and stored only the first time the
    article is seen; the entry expires after 7 days so old threads age
    out naturally.  Always returns True (callers use it as a completion
    signal only).
    """
    key = 'chh-article:%s' % article['article_id']
    if not redis_db.get(key):
        article['url'] = url_shorten(article['url'])
        redis_db.set(key, json.dumps(article), ex=3600 * 24 * 7)
    return True


def url_shorten(url):
    """Shorten *url* via the s.tunpok.com API, best-effort.

    Returns the shortened link on HTTP 201; on any other status, a
    network error, or a timeout, returns the original URL unchanged so
    a flaky shortener can never break the crawl.
    """
    api_url = "https://s.tunpok.com/api/v2/links"
    # NOTE(security): prefer the environment over the hardcoded fallback key.
    api_key = os.environ.get(
        'SHORTENER_API_KEY',
        "HJAs1~1sjd5mX66_Ydg8~9oSuqgkd75VeLhjjni6",
    )
    headers = {
        'x-api-key': api_key,
    }
    try:
        # A timeout keeps one slow shortener call from stalling the crawler.
        resp = requests.post(api_url, headers=headers, json={"target": url},
                             timeout=10)
    except requests.RequestException as exc:
        logger.warning(f"URL shortener unreachable ({exc}); keeping original URL")
        return url
    if resp.status_code == 201:
        return resp.json()['link']
    return url


def setup_driver():
    """Configure and initialize a headless Chrome WebDriver.

    The automation-control switches and a desktop user agent reduce the
    chance of the site serving a bot-detection page.  Caller owns the
    returned driver and must quit() it.
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless')  # Run in headless mode
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-blink-features=AutomationControlled')
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)
    # Set a desktop Chrome user agent.
    user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/120.0.0.0 Safari/537.36')
    chrome_options.add_argument(f'user-agent={user_agent}')
    chrome_options.add_argument('--window-size=1920,1080')
    driver = webdriver.Chrome(options=chrome_options)
    return driver


def crawler():
    """Crawl the chiphell home page and collect the thread list items.

    NOTE(review): the reviewed source chunk is truncated after collecting
    the <li> elements — the per-item parsing and the save_to_redis /
    mastodon posting steps are missing here and must be restored from the
    full file.  TODO confirm against the complete source.
    """
    # Initialize Selenium WebDriver
    home_url = 'https://www.chiphell.com/'
    driver = setup_driver()
    try:
        logger.info(f"Starting to crawl {home_url}")
        driver.get(home_url)
        # Wait for the JS-rendered page to settle before querying the DOM.
        time.sleep(3)
        # The thread list lives in the last 'chiphell_box' container.
        boxes = driver.find_elements(By.CLASS_NAME, 'chiphell_box')
        if not boxes:
            logger.error("No chiphell_box elements found")
            return
        last_box = boxes[-1]
        # Find the acon div within the last box.
        acon_div = last_box.find_element(By.CLASS_NAME, 'acon')
        # Find the ul with id='threadulid'.
        ul = acon_div.find_element(By.ID, 'threadulid')
        # Find all li elements.
        li_list = ul.find_elements(By.TAG_NAME, 'li')
        # ... truncated in the reviewed chunk: iterate li_list, build the
        # article dicts, save_to_redis(...) and post via mastodon_client.
    except (TimeoutException, NoSuchElementException) as exc:
        # Added to close the original (truncated) try block; log and bail.
        logger.error(f"Crawl failed: {exc}")
    finally:
        # Always release the browser process.
        driver.quit()