chh-crawler/crawler.py

247 lines
8.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

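# A crawler for https://www.chiphell.com/: scrapes the article list from the
# home page, caches new articles in Redis, and toots one unsent article to Mastodon.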
import requests
import re
import redis
import json
from loguru import logger
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
import time
from mastodon import Mastodon
# Configure logger - use local path for macOS
import os
# Create the log directory up front. exist_ok=True avoids the check-then-create
# race of a separate os.path.exists() test when two runs start at the same time.
log_dir = './logs'
os.makedirs(log_dir, exist_ok=True)
logger.add('./logs/chh-crawler.log', level='INFO')

# Connect to the local Redis instance (default port, db 0; no password is passed).
redis_db = redis.StrictRedis(host="localhost", port=6379, db=0)

# Mastodon client used by toot() to upload media and post statuses.
# NOTE(review): the access token is committed in source; consider loading it
# from the environment or a config file and rotating this one.
mastodon_client = Mastodon(
    access_token='8LIqGXmerhP8QABT3ppe-1HDATfbmJ-8iDY1_QxNkjk',
    api_base_url='https://nofan.xyz/',
)
def save_to_redis(article):
    """Store *article* in Redis if it has not been stored before.

    The Redis key is ``chh-article:<article_id>``. When the key is absent,
    ``article['url']`` is replaced IN PLACE with its shortened form (the
    caller's dict is mutated), and the article is saved as JSON with a
    7-day expiry.

    Args:
        article: dict with at least the keys ``'article_id'`` and ``'url'``;
            it must be JSON-serializable.

    Returns:
        True if the article was newly stored, False if it already existed.
    """
    key = 'chh-article:%s' % article['article_id']
    if redis_db.get(key):
        # Already cached: do not shorten the URL again or reset the TTL.
        return False
    article['url'] = url_shorten(article['url'])
    redis_db.set(key, json.dumps(article), ex=3600 * 24 * 7)
    return True
def url_shorten(url):
    """Return a shortened version of *url*, or *url* itself on any failure.

    Posts ``{"target": url}`` to the link-shortener API. Shortening is
    best-effort: a non-201 response, a network error/timeout, or an
    unexpected response body all fall back to returning the original URL.

    Args:
        url: the URL to shorten.

    Returns:
        The value of the ``'link'`` field from the API response when the
        request succeeds with HTTP 201, otherwise the unmodified *url*.
    """
    api_url = "https://s.tunpok.com/api/v2/links"
    # NOTE(review): API key is committed in source; consider moving it to
    # the environment or a config file.
    api_key = "HJAs1~1sjd5mX66_Ydg8~9oSuqgkd75VeLhjjni6"
    headers = {
        'x-api-key': api_key,
    }
    try:
        # Without a timeout requests can block forever on a stalled server.
        resp = requests.post(
            api_url, headers=headers, json={"target": url}, timeout=10
        )
    except requests.RequestException as exc:
        logger.warning(f"URL shortener request failed, keeping original URL: {exc}")
        return url

    if resp.status_code != 201:
        return url

    try:
        return resp.json()['link']
    except (ValueError, KeyError, TypeError) as exc:
        # 201 with a body that is not JSON or lacks 'link'.
        logger.warning(f"Unexpected URL shortener response, keeping original URL: {exc}")
        return url
def setup_driver():
    """Create and return a headless Chrome WebDriver.

    The driver binary is resolved through ChromeDriverManager. Chrome is
    started headless with a fixed desktop user agent, a 1920x1080 window,
    and the automation switch/extension disabled.
    """
    ua_string = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'

    opts = Options()
    # Command-line switches applied before the experimental options.
    for flag in (
        '--headless',  # Run in headless mode
        '--disable-gpu',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
    ):
        opts.add_argument(flag)

    opts.add_experimental_option("excludeSwitches", ["enable-automation"])
    opts.add_experimental_option('useAutomationExtension', False)

    # User agent and viewport size.
    opts.add_argument(f'user-agent={ua_string}')
    opts.add_argument('--window-size=1920,1080')

    return webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=opts,
    )
def _extract_article(li, home_url):
    """Build the article dict for one ``<li>`` of the home-page thread list.

    Each list item is expected to look roughly like::

        <li>
          <a class="tm01 cl" href="article-30010-1.html"><img src="..."></a>
          <div class="tmpad cl">
            <a class="tm03 cl" href="article-30010-1.html">TITLE</a>
            <div class="avart">
              <div class="avimain cl"><a href="...">AUTHOR</a></div>
              <div class="avimain2 cl">
                <span style="padding-left: 0px;">2023/07/16</span>
                <a class="asort cl" href="...">CATEGORY</a>
              </div>
            </div>
            <div class="tm04 cl">CONTENT PREVIEW ...</div>
          </div>
        </li>

    The image and the title link are required; a missing one raises
    NoSuchElementException to the caller. Author, date, category and
    content are optional and fall back to placeholder strings.

    Args:
        li: Selenium WebElement for the list item.
        home_url: prefix prepended to the link when it does not start
            with ``http``.

    Returns:
        The article dict, or None when the link does not match
        ``article-<id>-1.html``.
    """
    # get the article img
    img_url = li.find_element(By.TAG_NAME, 'img').get_attribute('src')

    # get the article title and URL
    title_element = li.find_element(By.CLASS_NAME, 'tm03')
    title = title_element.text
    url = title_element.get_attribute('href')
    if not url.startswith('http'):
        url = home_url + url

    # get the article id
    article_id_match = re.search(r'article-(\d+)-1\.html', url)
    if not article_id_match:
        return None
    article_id = article_id_match.group(1)

    # get the article author
    try:
        avimain_div = li.find_element(By.CLASS_NAME, 'avimain')
        author = avimain_div.find_element(By.TAG_NAME, 'a').text
    except NoSuchElementException:
        author = 'Unknown'

    # Date and category both live inside the 'avimain2' div. Look the div up
    # once per item so that a missing div can never make the category lookup
    # reuse the div of a previously processed item.
    date = 'Unknown'
    category = 'Unknown'
    try:
        avimain2_div = li.find_element(By.CLASS_NAME, 'avimain2')
    except NoSuchElementException:
        avimain2_div = None

    if avimain2_div is not None:
        # get the article date
        try:
            date = avimain2_div.find_element(
                By.CSS_SELECTOR, 'span[style*="padding-left"]'
            ).text
        except NoSuchElementException:
            pass
        # get the article category
        try:
            category = avimain2_div.find_element(By.CLASS_NAME, 'asort').text
        except NoSuchElementException:
            pass

    # get the article content
    try:
        content = li.find_element(By.CLASS_NAME, 'tm04').text
    except NoSuchElementException:
        content = 'No preview available'

    return {
        'img_url': img_url,
        'title': title,
        'author': author,
        'date': date,
        'category': category,
        'content': content,
        'url': url,
        'article_id': article_id,
    }


def crawler():
    """Crawl the chiphell home page and cache newly seen articles in Redis.

    Opens the home page in a headless Chrome session, takes the last
    ``chiphell_box`` element, and walks the ``<li>`` items of its
    ``#threadulid`` list. Each item is parsed into a dict and handed to
    save_to_redis(); articles that were newly stored are printed.

    A failure while processing a single item is logged and that item is
    skipped. Errors locating the list container itself are not caught
    here and propagate to the caller. The WebDriver is always closed.
    """
    home_url = 'https://www.chiphell.com/'
    # Initialize Selenium WebDriver
    driver = setup_driver()
    try:
        logger.info(f"Starting to crawl {home_url}")
        driver.get(home_url)
        # Wait for page to load
        time.sleep(3)

        # Debug: log page info
        logger.info(f"Page title: {driver.title}")
        logger.info(f"Current URL: {driver.current_url}")
        page_source = driver.page_source
        logger.info(f"Page source length: {len(page_source)}")
        logger.info(f"Page source preview (first 2000 chars):\n{page_source[:2000]}")

        # Find all chiphell_box elements and get the last one
        boxes = driver.find_elements(By.CLASS_NAME, 'chiphell_box')
        if not boxes:
            logger.error("No chiphell_box elements found")
            return
        last_box = boxes[-1]

        # Drill down: .acon div -> ul#threadulid -> li items
        acon_div = last_box.find_element(By.CLASS_NAME, 'acon')
        ul = acon_div.find_element(By.ID, 'threadulid')
        li_list = ul.find_elements(By.TAG_NAME, 'li')

        for li in li_list:
            try:
                article = _extract_article(li, home_url)
                if article is None:
                    # Link is not an article-<id>-1.html URL; nothing to store.
                    continue
                # save the article info to redis
                if save_to_redis(article):
                    print(article)
            except Exception as e:
                # Best-effort per item: log and move on to the next one.
                logger.error(f"Error processing article: {e}")
                continue
    finally:
        # Close the WebDriver
        driver.quit()
        logger.info("WebDriver closed")
def toot():
    """Post at most one not-yet-sent cached article to Mastodon.

    Iterates the ``chh-article:*`` keys in Redis, skips articles whose
    ``article_id`` is already in the ``send-chh-article-id`` set, and for
    the first remaining article downloads its image, uploads it as media,
    posts a status, records the id in the set, and stops.

    An article whose image cannot be downloaded is logged and skipped so
    that it does not block the other articles; it is not marked as sent.
    Errors from the Mastodon client are not caught here.
    """
    # get all the keys in redis
    keys = redis_db.keys('chh-article:*')
    for key in keys:
        raw = redis_db.get(key)
        if raw is None:
            # Key expired between keys() and get().
            continue
        article = json.loads(raw)

        # skip the article if its article_id is already in the sent set
        if redis_db.sismember('send-chh-article-id', article['article_id']):
            continue

        # download article image
        try:
            img = requests.get(article['img_url'], timeout=30)
            # Avoid uploading an HTTP error page as if it were the image.
            img.raise_for_status()
        except requests.RequestException as exc:
            logger.error(
                f"Failed to download image for article {article['article_id']}: {exc}"
            )
            continue

        # upload article image to mastodon
        media = mastodon_client.media_post(img.content, 'image/jpeg')

        # toot the article info
        toot_content = """{title} - #{category} by {author} \n {content} \n\n {url} \n\n #chh #chiphell \n """.format(
            title=article['title'],
            category=article['category'],
            author=article['author'],
            content=article['content'],
            url=article['url'])
        mastodon_client.status_post(toot_content, media_ids=[media['id']])

        # remember that this article has been sent
        redis_db.sadd('send-chh-article-id', article['article_id'])
        # Only one toot per run.
        break
if __name__ == '__main__':
    # Crawl first so that toot() can pick up any newly cached article.
    crawler()
    toot()
    # Report a successful run to the push endpoint (presumably an uptime
    # monitor heartbeat -- confirm). It is only reached when both steps above
    # finished without raising. The timeout keeps a stalled endpoint from
    # hanging the script.
    requests.get(
        'https://up.tunpok.com/api/push/yoLZhc0rZR?status=up&msg=OK&ping=',
        timeout=10,
    )