247 lines
8.2 KiB
Python
247 lines
8.2 KiB
Python
### a crawler for the website: https://www.chiphell.com/
|
||
|
||
|
||
import requests
|
||
import re
|
||
import redis
|
||
import json
|
||
from loguru import logger
|
||
from selenium import webdriver
|
||
from selenium.webdriver.common.by import By
|
||
from selenium.webdriver.support.ui import WebDriverWait
|
||
from selenium.webdriver.support import expected_conditions as EC
|
||
from selenium.webdriver.chrome.options import Options
|
||
from selenium.webdriver.chrome.service import Service
|
||
from selenium.common.exceptions import TimeoutException, NoSuchElementException
|
||
from webdriver_manager.chrome import ChromeDriverManager
|
||
import time
|
||
|
||
from mastodon import Mastodon
|
||
|
||
|
||
# Configure logger - use local path for macOS
import os
# Make sure the log directory exists before loguru tries to open the file.
log_dir = './logs'
if not os.path.exists(log_dir):
    os.makedirs(log_dir)
logger.add('./logs/chh-crawler.log', level='INFO')


# Connect to Redis on the default local port.
# NOTE(review): despite the original comment, no password is supplied here —
# this assumes an unauthenticated local Redis; confirm deployment setup.
redis_db = redis.StrictRedis(host="localhost",
port=6379, db=0)

# Mastodon client used by toot() to publish articles.
# NOTE(review): the access token is hard-coded in source — it should be
# rotated and moved to an environment variable or config file.
mastodon_client = Mastodon(
    access_token = '8LIqGXmerhP8QABT3ppe-1HDATfbmJ-8iDY1_QxNkjk',
    api_base_url = 'https://nofan.xyz/'
)
|
||
|
||
|
||
def save_to_redis(article):
    """Cache *article* in Redis (7-day TTL) if it is not already cached.

    The article's URL is shortened only when the article is newly stored,
    so the shortener API is not hit again for known articles.

    Args:
        article: dict with at least 'article_id' and 'url' keys; mutated
            in place ('url' is replaced by its shortened form) when stored.

    Returns:
        True when the article was newly stored, False when it was already
        cached. (The original returned True unconditionally, which made the
        caller treat every article as new on every run.)
    """
    key = 'chh-article:%s' % article['article_id']
    # NOTE(review): GET-then-SET is not atomic; fine for a single-process
    # cron-style crawler, but would race with concurrent runs.
    if redis_db.get(key):
        return False
    article['url'] = url_shorten(article['url'])
    redis_db.set(key, json.dumps(article), ex=3600 * 24 * 7)
    return True
|
||
|
||
def url_shorten(url):
    """Shorten *url* via the s.tunpok.com API, falling back to the original.

    Any failure mode — network error, timeout, non-201 status, or an
    unexpected response payload — returns *url* unchanged so the crawler
    never crashes because of the (optional) shortener.

    Args:
        url: the absolute URL to shorten.

    Returns:
        The shortened link string, or *url* on any failure.
    """
    api_url = "https://s.tunpok.com/api/v2/links"
    # NOTE(review): API key is hard-coded in source; move to an env var.
    api_key = "HJAs1~1sjd5mX66_Ydg8~9oSuqgkd75VeLhjjni6"
    headers = {
        'x-api-key': api_key,
    }
    try:
        # Bounded timeout: the original call could hang the whole run forever.
        resp = requests.post(api_url, headers=headers, json={"target": url},
                             timeout=15)
    except requests.RequestException as e:
        logger.warning(f"URL shortener unreachable: {e}")
        return url
    if resp.status_code == 201:
        try:
            return resp.json()['link']
        except (ValueError, KeyError) as e:
            # Malformed JSON or missing 'link' key — degrade gracefully.
            logger.warning(f"Unexpected shortener response: {e}")
            return url
    return url
|
||
|
||
def setup_driver():
    """Build a headless Chrome WebDriver tuned to avoid automation detection."""
    opts = Options()

    # Headless operation plus the usual container/CI-friendly flags.
    for flag in (
        '--headless',
        '--disable-gpu',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
    ):
        opts.add_argument(flag)

    # Strip the standard Selenium automation fingerprints.
    opts.add_experimental_option("excludeSwitches", ["enable-automation"])
    opts.add_experimental_option('useAutomationExtension', False)

    # Present as a regular desktop Chrome on macOS at a common resolution.
    ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    opts.add_argument(f'user-agent={ua}')
    opts.add_argument('--window-size=1920,1080')

    # webdriver_manager downloads/caches a matching chromedriver binary.
    return webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=opts,
    )
|
||
|
||
def crawler():
    """Crawl the chiphell.com front page and cache new article summaries.

    Loads the home page with Selenium, locates the article list in the last
    'chiphell_box' element, parses each <li> into an article dict via
    _extract_article(), and stores new ones through save_to_redis().
    The WebDriver is always closed, even on failure.
    """
    home_url = 'https://www.chiphell.com/'
    driver = setup_driver()

    try:
        logger.info(f"Starting to crawl {home_url}")
        driver.get(home_url)

        # Give the page's JavaScript time to render the article list.
        time.sleep(3)

        # Debug: log page info so failed runs can be diagnosed from the log.
        logger.info(f"Page title: {driver.title}")
        logger.info(f"Current URL: {driver.current_url}")
        page_source = driver.page_source
        logger.info(f"Page source length: {len(page_source)}")
        logger.info(f"Page source preview (first 2000 chars):\n{page_source[:2000]}")

        # The article list lives in the LAST chiphell_box on the page.
        boxes = driver.find_elements(By.CLASS_NAME, 'chiphell_box')
        if not boxes:
            logger.error("No chiphell_box elements found")
            return

        # chiphell_box > .acon > ul#threadulid > li (one per article)
        acon_div = boxes[-1].find_element(By.CLASS_NAME, 'acon')
        ul = acon_div.find_element(By.ID, 'threadulid')
        li_list = ul.find_elements(By.TAG_NAME, 'li')

        for li in li_list:
            try:
                article = _extract_article(li, home_url)
                if article is None:
                    # No article id could be parsed from the URL — skip.
                    continue
                # save_to_redis reports whether the article was new.
                if save_to_redis(article):
                    print(article)
            except Exception as e:
                # One broken <li> must not abort the whole crawl.
                logger.error(f"Error processing article: {e}")
                continue

    finally:
        driver.quit()
        logger.info("WebDriver closed")


def _extract_article(li, home_url):
    """Parse one article <li> into a dict, or return None if it has no id.

    Observed <li> layout:
        a.tm01 > img                          -- cover image
        div.tmpad > a.tm03                    -- title + link (article-<id>-1.html)
          div.avart > div.avimain > a         -- author name
                      div.avimain2 >
                        span[style*=padding-left]  -- publish date
                        a.asort                    -- category
        div.tm04                              -- content preview

    Args:
        li: Selenium WebElement for the list item.
        home_url: base URL used to absolutize relative article links.

    Returns:
        dict with img_url/title/author/date/category/content/url/article_id,
        or None when the article id cannot be extracted from the URL.

    Raises:
        NoSuchElementException: if the mandatory img or title element is
            missing (caught and logged by the caller's per-item handler).
    """
    # Mandatory fields: let a missing img/title raise to the caller.
    img_url = li.find_element(By.TAG_NAME, 'img').get_attribute('src')

    title_element = li.find_element(By.CLASS_NAME, 'tm03')
    title = title_element.text
    url = title_element.get_attribute('href')
    if not url.startswith('http'):
        url = home_url + url

    # The numeric id embedded in 'article-<id>-1.html' keys the Redis cache.
    article_id_match = re.search(r'article-(\d+)-1\.html', url)
    if not article_id_match:
        return None
    article_id = article_id_match.group(1)

    # Optional fields: narrow except clauses (the original used bare
    # 'except:', which also hid a NameError when the date lookup failed
    # before the category lookup referenced its local variable).
    try:
        author = li.find_element(By.CLASS_NAME, 'avimain') \
                   .find_element(By.TAG_NAME, 'a').text
    except NoSuchElementException:
        author = 'Unknown'

    try:
        date = li.find_element(
            By.CSS_SELECTOR, '.avimain2 span[style*="padding-left"]').text
    except NoSuchElementException:
        date = 'Unknown'

    try:
        category = li.find_element(By.CLASS_NAME, 'asort').text
    except NoSuchElementException:
        category = 'Unknown'

    try:
        content = li.find_element(By.CLASS_NAME, 'tm04').text
    except NoSuchElementException:
        content = 'No preview available'

    return {
        'img_url': img_url,
        'title': title,
        'author': author,
        'date': date,
        'category': category,
        'content': content,
        'url': url,
        'article_id': article_id,
    }
|
||
|
||
def toot():
    """Publish at most one unsent cached article to Mastodon.

    Scans the 'chh-article:*' keys in Redis, skips articles whose id is
    already in the 'send-chh-article-id' set, and toots the first unsent
    one (image + formatted text), then marks it sent and stops. Posting
    only one article per invocation is deliberate (rate limiting).
    """
    keys = redis_db.keys('chh-article:*')
    for key in keys:
        raw = redis_db.get(key)
        if raw is None:
            # Key expired between KEYS and GET — the original would have
            # crashed in json.loads(None) here.
            continue
        article = json.loads(raw)

        # Skip articles that were already tooted.
        if redis_db.sismember('send-chh-article-id', article['article_id']):
            continue

        # Download the cover image; on any failure skip this article
        # instead of crashing (the original had no timeout or status check,
        # so a 404 page would have been uploaded as the "image").
        try:
            img = requests.get(article['img_url'], timeout=30)
            img.raise_for_status()
        except requests.RequestException as e:
            logger.error(f"Failed to fetch image for article "
                         f"{article['article_id']}: {e}")
            continue

        # Upload the image bytes, then post the status referencing it.
        media = mastodon_client.media_post(img.content, 'image/jpeg')
        toot_content = """{title} - #{category} by {author} \n {content} \n\n {url} \n\n #chh #chiphell \n """.format(
            title=article['title'],
            category=article['category'],
            author=article['author'],
            content=article['content'],
            url=article['url'])

        mastodon_client.status_post(toot_content, media_ids=[media['id']])

        # Mark as sent, then stop: one toot per run by design.
        redis_db.sadd('send-chh-article-id', article['article_id'])
        break
|
||
|
||
if __name__ == '__main__':
    # One run: crawl the front page, toot one new article, then ping
    # the uptime monitor as a success heartbeat.
    crawler()
    toot()
    # Bounded timeout so a hung monitor endpoint can't block the process
    # indefinitely (the original call had no timeout at all).
    requests.get(
        'https://up.tunpok.com/api/push/yoLZhc0rZR?status=up&msg=OK&ping=',
        timeout=10)
|