feat(crawler): add chh crawler and toot-posting functions
Add chh crawler function and toot-posting function
Signed-off-by: Ching <loooching@gmail.com>
This commit is contained in:
parent a8de1b5643
commit 5dfbfa5c57
crawler.py  140  Normal file
@@ -0,0 +1,140 @@
### a crawler for the website: https://www.chiphell.com/


import requests
from bs4 import BeautifulSoup
import re
import redis
import json

from mastodon import Mastodon

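# third-party dependencies: requests, beautifulsoup4, redis and Mastodon.py
# (install with: pip install requests beautifulsoup4 redis Mastodon.py)
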
# connect to redis with password
redis_db = redis.StrictRedis(host="localhost",
                             port=6379, db=0, password='s7LkRNuaLxST5e')

mastodon_client = Mastodon(
    access_token='',
    api_base_url=''
)
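# NOTE: access_token and api_base_url are left empty here; fill in the bot's access
# token and the mastodon instance URL (e.g. 'https://mastodon.example') before calling toot()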


def save_to_redis(article):
    # store the article under a per-article key for one week; report whether it is new
    key = 'chh-article:%s' % article['article_id']
    if not redis_db.get(key):
        redis_db.set(key, json.dumps(article), ex=3600*24*7)
        return True
    # the article was already stored on an earlier run
    return False
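# for example, an article with article_id '30010' is stored under the key
# 'chh-article:30010'; entries expire after one week (ex=3600*24*7 seconds)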


def crawler():
    # get the article list from the html div with class "acon cl"
    home_url = 'https://www.chiphell.com/'
    # a normal chrome user agent
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 8_6_7) AppleWebKit/534.36 (KHTML, like Gecko) Chrome/51.0.2863.218 Safari/601'}
    # get the html page
    r = requests.get(home_url, headers=headers)
    # use BeautifulSoup to parse the html page
    soup = BeautifulSoup(r.text, features="html.parser")
    # find the div with class "acon cl" inside the last div with class "chiphell_box cl"
    div = soup.find_all('div', class_='chiphell_box cl')[-1]
    div = div.find('div', class_='acon cl')
    # articles are in the ul with id "threadulid"
    ul = div.find('ul', id='threadulid')
    # find all the li tags
    li_list = ul.find_all('li')
    # a list item is like:
    # <li>
    #     <a href="article-30010-1.html" target="_blank" class="tm01 cl">
    #         <img src="https://static.chiphell.com/portal/202307/16/080741btbodlblx8nwzn74.jpg">
    #     </a>
    #     <div class="tmpad cl">
    #         <a href="article-30010-1.html" target="_blank" class="tm03 cl">一点小收藏—AP皇家橡树</a>
    #         <div class="avart">
    #             <a href="space-username-幼月.html" target="_blank" class="tmava"><img src="./data/avatar/000/27/86/96_avatar_small.jpg"></a>
    #             <div class="avimain cl">
    #                 <a href="space-username-幼月.html" target="_blank" class="">幼月</a>
    #             </div>
    #             <div class="avimain2 cl">
    #                 <span style="padding-left: 0px;">2023/07/16</span>
    #                 <span class="avie">3231</span>
    #                 <span class="arep">48</span>
    #                 <a href="portal.php?mod=list&catid=128" target="_blank" class="asort cl">腕表</a>
    #             </div>
    #         </div>
    #         <div class="tm04 cl">
    #             又是我胡汉三,最近不知道怎么捅了腕表品牌的窝了。。三天两头给我塞东西。。所以作业按照混乱发!
    #             没有文笔,只有碎碎念。
    #             ROYAL OAK 15551OR
    #             女王的第一块AP,最早许愿是蓝盘但是到的是白盘,不过白盘也非常美 ...</div>
    #     </div>
    # </li>
    # get the article img, title, author, date, category, content and url
    for li in li_list:
        # get the article img
        img = li.find('img')
        img_url = img['src']
        # get the article title
        title = li.find('a', class_='tm03 cl')
        title = title.text
        # get the article author
        author = li.find('a', class_='')
        author = author.text
        # get the article date
        date = li.find('span', style='padding-left: 0px;')
        date = date.text
        # get the article category
        category = li.find('a', class_='asort cl')
        category = category.text
        # get the article content
        content = li.find('div', class_='tm04 cl')
        content = content.text
        # get the article url
        url = li.find('a', class_='tm03 cl')
        url = home_url + url['href']
        # get the article id
        article_id = re.findall(r'article-(\d+)-1\.html', url)[0]

        # make the article info a dict
        article = {
            'img_url': img_url,
            'title': title,
            'author': author,
            'date': date,
            'category': category,
            'content': content,
            'url': url,
            'article_id': article_id
        }

        # save the article info to redis and print it if it is new
        if save_to_redis(article):
            print(article)
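
# crawler() only stores new articles in redis and prints them; posting to
# Mastodon happens separately in toot()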


def toot():
    # get all the stored article keys in redis
    keys = redis_db.keys('chh-article:*')
    # get the article info from redis and toot each one
    for key in keys:
        article = json.loads(redis_db.get(key))
        # upload the article image to mastodon; media_post() needs a file, file-like
        # object or raw bytes rather than a URL, so download the image first
        img_resp = requests.get(article['img_url'])
        media = mastodon_client.media_post(img_resp.content,
                                           mime_type=img_resp.headers.get('Content-Type', 'image/jpeg'))
        # toot the article info
        toot_content = '{title} - {category} by {author}\n\n{content}\n\n{url}'.format(
            title=article['title'],
            category=article['category'],
            author=article['author'],
            content=article['content'],
            url=article['url'])

        # toot() only accepts the status text, so use status_post to attach the image
        mastodon_client.status_post(toot_content, media_ids=[media['id']])


if __name__ == '__main__':
    crawler()
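    # toot() is not invoked here; presumably it runs separately or on a schedule.
    # to post right after crawling, it could also be called here:
    # toot()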


# example of one scraped article (sample kept for reference):
article = {'img_url': 'https://static.chiphell.com/portal/202307/16/080741btbodlblx8nwzn74.jpg', 'title': '一点小收藏—AP皇家橡树', 'author': '幼月', 'date': '2023/07/16', 'category': '腕表', 'content': '\n又是我胡汉三,最近不知道怎么捅了腕表品牌的窝了。。三天两头给我塞东西。。所以作业按照混乱发!\r\n没有文笔,只有碎碎念。\r\n\r\n\r\nROYAL OAK 15551OR\r\n\r\n女王的第一块AP,最早许愿是蓝盘但是到的是白盘,不过白盘也非常美 ...', 'url': 'https://www.chiphell.com/article-30010-1.html', 'article_id': '30010'}