### crawler.py — from patch "feat(crawler): add chh crawler and toot functions"
### (Ching, 2023-07-16).
###
### Scrapes the front page of https://www.chiphell.com/, stores newly seen
### articles in Redis (7-day TTL, keyed by article id), and can announce the
### stored articles on Mastodon with their thumbnail image attached.

import io
import json
import os
import re

import requests
from bs4 import BeautifulSoup
import redis

from mastodon import Mastodon

HOME_URL = 'https://www.chiphell.com/'

# Pretend to be a normal desktop browser so the site serves the regular page.
HEADERS = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 8_6_7) AppleWebKit/534.36 (KHTML, like Gecko) Chrome/51.0.2863.218 Safari/601'}

# SECURITY: the Redis password was hard-coded in source; read connection
# settings from the environment instead (old values kept as fallbacks so
# existing deployments keep working — rotate that leaked password!).
redis_db = redis.StrictRedis(
    host=os.environ.get('REDIS_HOST', 'localhost'),
    port=int(os.environ.get('REDIS_PORT', '6379')),
    db=0,
    password=os.environ.get('REDIS_PASSWORD', 's7LkRNuaLxST5e'),
)

# Credentials are intentionally blank here; fill in (or load from the
# environment) before calling toot().
mastodon_client = Mastodon(
    access_token='',
    api_base_url='',
)


def save_to_redis(article):
    """Store *article* under 'chh-article:<id>' unless it already exists.

    Args:
        article: dict with at least an 'article_id' key; JSON-serialized
            as the Redis value.

    Returns:
        True when the article was newly stored, False when a record with
        the same id is already present (used as the de-duplication signal
        by crawler()). Entries expire after 7 days.
    """
    key = 'chh-article:%s' % article['article_id']
    if redis_db.get(key):
        return False
    redis_db.set(key, json.dumps(article), ex=3600 * 24 * 7)
    return True


def crawler():
    """Scrape the chiphell front page and save new articles to Redis.

    The article list lives in the LAST <div class="chiphell_box cl"> box,
    inside its <div class="acon cl">, as <li> items of <ul id="threadulid">.
    Each <li> holds a thumbnail <img>, a title link
    (<a class="tm03 cl" href="article-<id>-1.html">), the author link, a
    date span, a category link (<a class="asort cl">) and a teaser
    <div class="tm04 cl">. Newly stored articles are printed.
    """
    r = requests.get(HOME_URL, headers=HEADERS, timeout=30)
    # Fail loudly on HTTP errors instead of silently parsing an error page.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, features="html.parser")

    div = soup.find_all('div', class_='chiphell_box cl')[-1]
    div = div.find('div', class_='acon cl')
    ul = div.find('ul', id='threadulid')

    for li in ul.find_all('li'):
        img = li.find('img')
        title_link = li.find('a', class_='tm03 cl')
        author_link = li.find('a', class_='')
        date_span = li.find('span', style='padding-left: 0px;')
        category_link = li.find('a', class_='asort cl')
        content_div = li.find('div', class_='tm04 cl')

        # Guard against ad items / layout changes: the original code
        # crashed with AttributeError or IndexError on any missing piece.
        if not (img and title_link and content_div):
            continue

        url = HOME_URL + title_link['href']
        id_match = re.search(r'article-(\d+)-1\.html', url)
        if not id_match:
            continue  # not a regular article link

        article = {
            'img_url': img['src'],
            'title': title_link.text,
            'author': author_link.text if author_link else '',
            'date': date_span.text if date_span else '',
            'category': category_link.text if category_link else '',
            'content': content_div.text,
            'url': url,
            'article_id': id_match.group(1),
        }

        if save_to_redis(article):
            print(article)


def toot():
    """Post every article currently stored in Redis to Mastodon.

    Uploads the article thumbnail as a media attachment, then toots the
    title/category/author, teaser and URL.

    NOTE(review): nothing marks an article as already tooted, so repeated
    calls re-post everything still in Redis — confirm whether that is
    intended before scheduling this function.
    """
    for key in redis_db.keys('chh-article:*'):
        article = json.loads(redis_db.get(key))

        # BUG FIX: media_post() cannot fetch a remote URL by itself — the
        # original passed the image URL straight in. Download the image
        # bytes first, then upload them with an explicit mime type.
        img_resp = requests.get(article['img_url'], headers=HEADERS, timeout=30)
        img_resp.raise_for_status()
        media = mastodon_client.media_post(
            io.BytesIO(img_resp.content),
            mime_type=img_resp.headers.get('Content-Type', 'image/jpeg'),
        )

        toot_content = """{title} - {category} by {author} \n
    {content} \n
    {url}""".format(title=article['title'],
                    category=article['category'],
                    author=article['author'],
                    content=article['content'],
                    url=article['url'])

        mastodon_client.toot(toot_content, media_ids=[media['id']])


if __name__ == '__main__':
    crawler()


# Example of a stored article record (leftover debug data from the original
# patch, kept for reference):
# {'img_url': 'https://static.chiphell.com/portal/202307/16/080741btbodlblx8nwzn74.jpg',
#  'title': '一点小收藏—AP皇家橡树', 'author': '幼月', 'date': '2023/07/16',
#  'category': '腕表',
#  'content': '\n又是我胡汉三,最近不知道怎么捅了腕表品牌的窝了。。三天两头给我塞东西。。所以作业按照混乱发!\r\n没有文笔,只有碎碎念。\r\n\r\n\r\nROYAL OAK 15551OR\r\n\r\n女王的第一块AP,最早许愿是蓝盘但是到的是白盘,不过白盘也非常美 ...',
#  'url': 'https://www.chiphell.com/article-30010-1.html', 'article_id': '30010'}