# chiphell.com front-page crawler -> Redis cache -> Mastodon poster
### a crawler for the website: https://www.chiphell.com/
|
||
|
||
|
||
import requests
|
||
from bs4 import BeautifulSoup
|
||
import re
|
||
import redis
|
||
import json
|
||
import cloudscraper
|
||
from loguru import logger
|
||
|
||
from mastodon import Mastodon
|
||
|
||
|
||
# Persist crawler logs to disk; loguru keeps its default stderr sink as well.
logger.add('/root/develop/log/chh-craler.log', level='INFO')


# Local Redis connection. NOTE(review): despite the original "with password"
# note, no password is passed here; db 0 holds both the cached article JSON
# and the sent-article-id set.
redis_db = redis.StrictRedis(host="localhost",
                             port=6379, db=0)

# Mastodon API client used by toot().
# SECURITY NOTE(review): the access token is committed in source control;
# it should be rotated and loaded from an environment variable instead.
mastodon_client = Mastodon(
    access_token = '8LIqGXmerhP8QABT3ppe-1HDATfbmJ-8iDY1_QxNkjk',
    api_base_url = 'https://nofan.xyz/'
)

# Cloudflare-aware HTTP session — presumably chiphell.com sits behind
# Cloudflare, which a plain requests session cannot pass; TODO confirm.
scraper = cloudscraper.create_scraper()
def save_to_redis(article):
    """Cache *article* in Redis if its id has not been seen before.

    The article's URL is shortened (external API call) only for articles
    that are actually stored, and the cached entry expires after 7 days.

    Returns True when a new article was stored, False when the article id
    was already cached (previously this path fell through with an implicit
    None — same truthiness, now explicit).
    """
    key = 'chh-article:%s' % article['article_id']
    if not redis_db.get(key):
        article['url'] = url_shorten(article['url'])
        # ex = 7 days: old articles age out of the cache automatically
        redis_db.set(key, json.dumps(article), ex=3600*24*7)
        return True
    return False
def url_shorten(url):
    """Shorten *url* via the s.tunpok.com link API (best effort).

    On a non-201 response — or any network failure / timeout — the original
    URL is returned unchanged, so the caller never loses the link.
    """
    api_url = "https://s.tunpok.com/api/v2/links"
    # SECURITY NOTE(review): API key committed in source; move to env/config.
    api_key = "HJAs1~1sjd5mX66_Ydg8~9oSuqgkd75VeLhjjni6"
    headers = {
        'x-api-key': api_key,
    }
    try:
        # timeout added: without it a stalled shortener hangs the whole crawl
        resp = requests.post(api_url, headers=headers,
                             json={"target": url}, timeout=10)
    except requests.RequestException:
        # network error on a cosmetic feature: fall back to the long URL
        return url
    if resp.status_code == 201:
        return resp.json()['link']
    else:
        return url
# Pattern for the numeric id in article URLs, e.g. "article-30010-1.html".
# The dot is now escaped — the original unescaped '.' matched any character.
_ARTICLE_ID_RE = re.compile(r'article-(\d+)-1\.html')


def _parse_article(li, home_url):
    """Extract one article dict from a front-page ``<li>``.

    Each list item holds: a thumbnail <img>, the title link ``a.tm03 cl``,
    an author link whose class attribute is empty, a date <span>, a category
    link ``a.asort cl`` and a teaser <div class="tm04 cl">.

    Raises TypeError/AttributeError/KeyError/IndexError when any expected
    field is missing; the caller treats that as "skip this entry".
    """
    img_url = li.find('img')['src']
    title = li.find('a', class_='tm03 cl').text
    # the author link is the <a> with an empty class attribute
    author = li.find('a', class_='').text
    date = li.find('span', style='padding-left: 0px;').text
    category = li.find('a', class_='asort cl').text
    content = li.find('div', class_='tm04 cl').text
    url = home_url + li.find('a', class_='tm03 cl')['href']
    # "article-30010-1.html" -> "30010"; IndexError when the URL is unexpected
    article_id = _ARTICLE_ID_RE.findall(url)[0]
    return {
        'img_url': img_url,
        'title': title,
        'author': author,
        'date': date,
        'category': category,
        'content': content,
        'url': url,
        'article_id': article_id,
    }


def crawler():
    """Scrape the chiphell.com front page and cache new articles in Redis.

    Articles live in the last <div class="chiphell_box cl"> box, inside its
    <div class="acon cl">, one <li> per article under <ul id="threadulid">.
    Newly cached articles are logged; already-seen ones are silently skipped
    by save_to_redis().
    """
    home_url = 'https://www.chiphell.com/'
    # cloudscraper solves the Cloudflare challenge a plain requests.get cannot
    r = scraper.get(home_url)
    soup = BeautifulSoup(r.text, features="html.parser")
    div = soup.find_all('div', class_='chiphell_box cl')[-1]
    div = div.find('div', class_='acon cl')
    ul = div.find('ul', id='threadulid')
    li_list = ul.find_all('li')
    for li in li_list:
        try:
            article = _parse_article(li, home_url)
        except (TypeError, AttributeError, KeyError, IndexError):
            # one malformed entry must not abort the whole crawl
            # (previously any missing tag crashed crawler() entirely)
            logger.warning('skipping malformed article list item')
            continue
        # save_to_redis returns True only for articles not seen before
        if save_to_redis(article):
            logger.info(article)
def toot():
    """Post at most one cached, not-yet-announced article to Mastodon.

    Iterates the cached articles in Redis, skips ids already recorded in the
    'send-chh-article-id' set, and for the first unsent article: downloads
    its image, uploads it as Mastodon media, posts a status, records the id
    as sent, then stops.
    """
    # NOTE(review): KEYS scans the whole keyspace — fine at this scale,
    # but SCAN would be preferred on a large database.
    keys = redis_db.keys('chh-article:*')
    for key in keys:
        article = json.loads(redis_db.get(key))
        # Skip articles already announced (membership in the sent-id set).
        if redis_db.sismember('send-chh-article-id', article['article_id']):
            continue

        # Download the article image (cloudscraper handles Cloudflare).
        img = scraper.get(article['img_url'], timeout=10)
        # Upload the image bytes to Mastodon.
        # Assumes the thumbnail is JPEG — TODO confirm; chiphell may serve PNG.
        media = mastodon_client.media_post(img.content, 'image/jpeg')
        logger.info('Toot %s' % article['title'])
        # Compose the status text: title, category hashtag, author, teaser,
        # shortened URL and site hashtags.
        toot_content = """{title} - #{category} by {author} \n {content} \n\n {url} \n\n #chh #chiphell \n """.format(
            title=article['title'],
            category=article['category'],
            author=article['author'],
            content=article['content'],
            url=article['url'])

        mastodon_client.status_post(toot_content, media_ids=[media['id']])

        # Record the id so this article is never posted again.
        redis_db.sadd('send-chh-article-id', article['article_id'])
        # Deliberate: only one toot per invocation (simple rate limiting).
        break
if __name__ == '__main__':
    crawler()
    toot()
    # Healthcheck ping — only reached when crawl + toot completed without
    # raising, so the monitor sees failures as missed pings.
    # timeout added: without it the script could hang on the monitoring host.
    requests.get('https://up.tunpok.com/api/push/yoLZhc0rZR?status=up&msg=OK&ping=',
                 timeout=10)