chh-crawler/crawler.py
Ching 5dfbfa5c57 feat(crawler): 增加 chh 爬虫函数和发嘟嘟函数
增加 chh 爬虫函数和发嘟嘟函数

Signed-off-by: Ching <loooching@gmail.com>
2023-07-16 21:20:04 +08:00

141 lines
5.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

### a crawler for the website: https://www.chiphell.com/
import json
import mimetypes
import os
import re
from urllib.parse import urljoin

import redis
import requests
from bs4 import BeautifulSoup
from mastodon import Mastodon
# Shared service clients used by the functions in this module.
#
# Connection settings are read from the environment so that credentials do not
# have to live in source control. Each fallback reproduces the value that was
# previously hard-coded, so behavior is unchanged when no variable is set.
# NOTE(review): the Redis password fallback below has already been committed to
# the repository; rotate it and drop the literal once CHH_REDIS_PASSWORD is set
# wherever this script runs.
redis_db = redis.StrictRedis(
    host=os.environ.get('CHH_REDIS_HOST', 'localhost'),
    port=int(os.environ.get('CHH_REDIS_PORT', '6379')),
    db=int(os.environ.get('CHH_REDIS_DB', '0')),
    password=os.environ.get('CHH_REDIS_PASSWORD', 's7LkRNuaLxST5e'),
)

# The Mastodon token and instance URL were empty placeholders; they can now be
# supplied without editing the file.
mastodon_client = Mastodon(
    access_token=os.environ.get('CHH_MASTODON_ACCESS_TOKEN', ''),
    api_base_url=os.environ.get('CHH_MASTODON_API_BASE_URL', ''),
)
def save_to_redis(article):
    """Store *article* in Redis unless an entry for its id already exists.

    The article is serialized as JSON under the key
    ``chh-article:<article_id>`` with a seven-day expiry.

    Args:
        article: dict describing one article; must contain ``'article_id'``
            and be JSON-serializable.

    Returns:
        True if the article was newly stored, False if the key was already
        present (nothing is written in that case).
    """
    key = 'chh-article:%s' % article['article_id']
    # SET with NX only writes when the key is absent, so the existence check
    # and the write happen in one atomic command instead of a GET followed by
    # a SET (which could race with another writer and costs two round-trips).
    created = redis_db.set(key, json.dumps(article), ex=3600 * 24 * 7, nx=True)
    # redis-py reports a refused NX write as None; normalize to a plain bool.
    return bool(created)
# Captures the numeric id from first-page article links such as
# "article-30010-1.html". The dot is escaped so it only matches a literal ".".
_ARTICLE_ID_RE = re.compile(r'article-(\d+)-1\.html')


def _parse_article(li, home_url):
    """Extract one article's fields from a homepage ``<li>`` element.

    The list item is expected to look roughly like this:

        <li>
          <a href="article-30010-1.html" class="tm01 cl"><img src="..."></a>
          <div class="tmpad cl">
            <a href="article-30010-1.html" class="tm03 cl">TITLE</a>
            <div class="avart">
              <a href="space-username-NAME.html" class="tmava"><img ...></a>
              <div class="avimain cl">
                <a href="space-username-NAME.html" class="">AUTHOR</a>
              </div>
              <div class="avimain2 cl">
                <span style="padding-left: 0px;">2023/07/16</span>
                <span class="avie">VIEWS</span>
                <span class="arep">REPLIES</span>
                <a href="portal.php?mod=list&amp;catid=128"
                   class="asort cl">CATEGORY</a>
              </div>
            </div>
            <div class="tm04 cl">SUMMARY TEXT ...</div>
          </div>
        </li>

    Args:
        li: BeautifulSoup tag for one list item.
        home_url: site root used to resolve the article's relative link.

    Returns:
        A dict with keys ``img_url``, ``title``, ``author``, ``date``,
        ``category``, ``content``, ``url`` and ``article_id``, or None when
        the item lacks any expected element or its link is not an article
        link.
    """
    img = li.find('img')
    title_link = li.find('a', class_='tm03 cl')
    # The author link is the anchor whose class attribute is empty (see the
    # sample markup above); the selector is kept exactly as before.
    author = li.find('a', class_='')
    date = li.find('span', style='padding-left: 0px;')
    category = li.find('a', class_='asort cl')
    content = li.find('div', class_='tm04 cl')

    nodes = (img, title_link, author, date, category, content)
    if any(node is None for node in nodes):
        return None

    img_url = img.get('src')
    href = title_link.get('href')
    if not img_url or not href:
        return None

    # urljoin resolves relative links against the site root and leaves
    # already-absolute links intact, unlike plain string concatenation.
    url = urljoin(home_url, href)
    match = _ARTICLE_ID_RE.search(url)
    if match is None:
        return None

    return {
        'img_url': img_url,
        'title': title_link.text,
        'author': author.text,
        'date': date.text,
        'category': category.text,
        'content': content.text,
        'url': url,
        'article_id': match.group(1),
    }


def crawler():
    """Fetch the Chiphell homepage and store every newly seen article.

    Articles are read from the ``<ul id="threadulid">`` inside the
    ``div.acon.cl`` of the last ``div.chiphell_box.cl`` on the page. Each
    parsed article is handed to ``save_to_redis``; those reported as new are
    printed. List items that cannot be parsed are skipped so that one
    malformed entry does not abort the whole crawl.

    Raises:
        requests.RequestException: the request failed, timed out, or the
            server answered with an HTTP error status.
        RuntimeError: the expected article list container is not in the page.
    """
    home_url = 'https://www.chiphell.com/'
    # A normal desktop Chrome user agent.
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 8_6_7) AppleWebKit/534.36 (KHTML, like Gecko) Chrome/51.0.2863.218 Safari/601'}

    # Without a timeout requests may wait indefinitely on an unresponsive
    # server; raise_for_status keeps error pages from being parsed as content.
    r = requests.get(home_url, headers=headers, timeout=10)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, features="html.parser")

    boxes = soup.find_all('div', class_='chiphell_box cl')
    container = boxes[-1].find('div', class_='acon cl') if boxes else None
    ul = container.find('ul', id='threadulid') if container is not None else None
    if ul is None:
        raise RuntimeError(
            'article list (ul#threadulid) not found on %s; '
            'the page layout may have changed' % home_url
        )

    for li in ul.find_all('li'):
        article = _parse_article(li, home_url)
        if article is None:
            continue
        # Only report articles that were not already stored.
        if save_to_redis(article):
            print(article)
def toot():
    """Post every article currently stored in Redis to Mastodon.

    For each ``chh-article:*`` key the stored JSON is loaded, the article
    image is downloaded and uploaded as a media attachment, and a status
    containing title, category, author, summary and link is published with
    that attachment.

    NOTE(review): nothing here records which articles were already posted,
    so calling this repeatedly re-posts everything still present in Redis.
    Confirm whether that is intended.

    Raises:
        requests.RequestException: an article image could not be downloaded.
    """
    for key in redis_db.keys('chh-article:*'):
        raw = redis_db.get(key)
        if raw is None:
            # The key expired between listing and reading; nothing to post.
            continue
        article = json.loads(raw)

        # media_post expects a local file name or raw data, not a remote URL,
        # so fetch the image bytes first and upload those.
        img_url = article['img_url']
        resp = requests.get(img_url, timeout=10)
        resp.raise_for_status()
        # Raw data must be accompanied by its MIME type; prefer the server's
        # Content-Type (minus any parameters), else guess from the extension.
        mime_type = resp.headers.get('Content-Type', '').split(';')[0].strip()
        if not mime_type:
            mime_type = mimetypes.guess_type(img_url)[0]
        media = mastodon_client.media_post(resp.content, mime_type=mime_type)

        toot_content = (
            '{title} - {category} by {author} \n\n'
            '{content} \n\n'
            '{url}'
        ).format(
            title=article['title'],
            category=article['category'],
            author=article['author'],
            content=article['content'],
            url=article['url'],
        )
        # Mastodon.toot() accepts only the status text; attachments have to
        # go through status_post().
        mastodon_client.status_post(toot_content, media_ids=[media['id']])
# Script entry point: running this file directly performs one crawl of the
# homepage. toot() is not invoked here.
if __name__ == '__main__':
    crawler()
# NOTE(review): sample record with the same keys as the dicts crawler() builds.
# Nothing in the visible code reads it — presumably a leftover debugging
# fixture. The original indentation was lost in this copy, so confirm whether
# it belongs inside the guard above or can be removed.
article = {'img_url': 'https://static.chiphell.com/portal/202307/16/080741btbodlblx8nwzn74.jpg', 'title': '一点小收藏—AP皇家橡树', 'author': '幼月', 'date': '2023/07/16', 'category': '腕表', 'content': '\n又是我胡汉三,最近不知道怎么捅了腕表品牌的窝了。。三天两头给我塞东西。。所以作业按照混乱发!\r\n没有文笔,只有碎碎念。\r\n\r\n\r\nROYAL OAK 15551OR\r\n\r\n女王的第一块AP最早许愿是蓝盘但是到的是白盘不过白盘也非常美 ...', 'url': 'https://www.chiphell.com/article-30010-1.html', 'article_id': '30010'}