### A crawler for https://www.chiphell.com/ that mirrors new front-page
### articles into redis and toots them to a Mastodon instance.
import json
import re

import cloudscraper
import redis
import requests
from bs4 import BeautifulSoup
from loguru import logger
from mastodon import Mastodon

logger.add('/root/develop/log/chh-craler.log', level='INFO')

# Local redis, default db, no password configured.
redis_db = redis.StrictRedis(host="localhost", port=6379, db=0)

# SECURITY NOTE(review): access token is hard-coded and now exposed in the
# repository history — move it to an environment variable and rotate it.
mastodon_client = Mastodon(
    access_token='8LIqGXmerhP8QABT3ppe-1HDATfbmJ-8iDY1_QxNkjk',
    api_base_url='https://nofan.xyz/'
)

# cloudscraper transparently solves Cloudflare's anti-bot challenge,
# which plain requests cannot get past on chiphell.com.
scraper = cloudscraper.create_scraper()


def save_to_redis(article):
    """Persist *article* under 'chh-article:<id>' unless already stored.

    The article URL is shortened before storage; entries expire after 7 days.

    Returns:
        True when a new entry was written, False when the key already existed
        (bugfix: previously fell through and returned an implicit None).
    """
    key = 'chh-article:%s' % article['article_id']
    if redis_db.get(key):
        return False
    article['url'] = url_shorten(article['url'])
    redis_db.set(key, json.dumps(article), ex=3600 * 24 * 7)
    return True


def url_shorten(url):
    """Shorten *url* via the s.tunpok.com API.

    Falls back to the original URL on a non-201 response or any network
    error, so a shortener outage cannot abort the crawl.
    """
    api_url = "https://s.tunpok.com/api/v2/links"
    # SECURITY NOTE(review): hard-coded API key — move to configuration.
    api_key = "HJAs1~1sjd5mX66_Ydg8~9oSuqgkd75VeLhjjni6"
    headers = {
        'x-api-key': api_key,
    }
    try:
        # timeout added: the original call could hang the whole script.
        resp = requests.post(api_url, headers=headers, json={"target": url},
                             timeout=10)
    except requests.RequestException:
        return url
    if resp.status_code == 201:
        return resp.json()['link']
    return url


def crawler():
    """Scrape the chiphell.com front page and store new articles in redis.

    Articles live in the last <div class="chiphell_box cl">, inside
    <div class="acon cl"> / <ul id="threadulid">, one <li> per article.
    """
    home_url = 'https://www.chiphell.com/'
    r = scraper.get(home_url)
    soup = BeautifulSoup(r.text, features="html.parser")
    div = soup.find_all('div', class_='chiphell_box cl')[-1]
    div = div.find('div', class_='acon cl')
    ul = div.find('ul', id='threadulid')
    li_list = ul.find_all('li')
    # Compiled once, outside the loop; dot escaped (was 'article-(\d+)-1.html').
    article_id_re = re.compile(r'article-(\d+)-1\.html')
    for li in li_list:
        # Robustness: a malformed <li> (missing tag, attribute, or id in the
        # URL) is logged and skipped instead of crashing the whole crawl with
        # AttributeError/TypeError/IndexError as the original did.
        try:
            img_url = li.find('img')['src']
            title_tag = li.find('a', class_='tm03 cl')
            title = title_tag.text
            author = li.find('a', class_='').text
            date = li.find('span', style='padding-left: 0px;').text
            category = li.find('a', class_='asort cl').text
            content = li.find('div', class_='tm04 cl').text
            url = home_url + title_tag['href']
            article_id = article_id_re.findall(url)[0]
        except (AttributeError, TypeError, KeyError, IndexError) as exc:
            logger.warning('skipping malformed list item: %s' % exc)
            continue
        article = {
            'img_url': img_url,
            'title': title,
            'author': author,
            'date': date,
            'category': category,
            'content': content,
            'url': url,
            'article_id': article_id,
        }
        if save_to_redis(article):
            logger.info(article)


def toot():
    """Post at most one not-yet-tooted stored article to Mastodon.

    Sent article ids are tracked in the redis set 'send-chh-article-id'.
    """
    keys = redis_db.keys('chh-article:*')
    for key in keys:
        article = json.loads(redis_db.get(key))
        # Skip articles that were already tooted.
        if redis_db.sismember('send-chh-article-id', article['article_id']):
            continue
        # Download the cover image and attach it to the toot.
        img = scraper.get(article['img_url'], timeout=10)
        media = mastodon_client.media_post(img.content, 'image/jpeg')
        logger.info('Toot %s' % article['title'])
        toot_content = """{title} - #{category} by {author} \n {content} \n\n {url} \n\n #chh #chiphell \n """.format(
            title=article['title'],
            category=article['category'],
            author=article['author'],
            content=article['content'],
            url=article['url'])
        mastodon_client.status_post(toot_content, media_ids=[media['id']])
        redis_db.sadd('send-chh-article-id', article['article_id'])
        # Deliberate: at most one toot per run to avoid flooding the timeline.
        break


if __name__ == '__main__':
    crawler()
    toot()
    # Heartbeat ping so the uptime monitor knows the run completed.
    requests.get('https://up.tunpok.com/api/push/yoLZhc0rZR?status=up&msg=OK&ping=')