feat(crawler): update crawler to use RSS feed for article retrieval

Replaced HTML scraping with RSS feed parsing to fetch article details including title, URL, author, date, category, content, and image link. This improves reliability and efficiency in gathering articles from the source.
2025-04-07 11:32:00 +08:00 · 2025-04-07 11:32:00 +08:00 · 15addaba24
commit 15addaba24
parent c5bf60858c
1 changed file with 30 additions and 67 deletions
--- a/crawler.py
+++ b/crawler.py
@ -6,6 +6,7 @@ from bs4 import BeautifulSoup
 import re
 import redis
 import json
 import feedparser
 from loguru import logger
 from mastodon import Mastodon
@ -46,75 +47,37 @@ def url_shorten(url):
    return url
 def crawler():
-  # get article list in html div class name = "acon cl"
+  rss_url = 'https://www.chiphell.com/portal.php?mod=rss'
-  home_url = 'https://www.chiphell.com/'
+  feed = feedparser.parse(rss_url)
  # a normal chrome user agent
  headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 8_6_7) AppleWebKit/534.36 (KHTML, like Gecko) Chrome/51.0.2863.218 Safari/601'}
  # get the html page
  r = requests.get(home_url, headers=headers)
  # use BeautifulSoup to parse the html page
  soup = BeautifulSoup(r.text, features="html.parser")
  # find the div class name = "acon cl" in the last div name = "chiphell_box cl"
  div = soup.find_all('div', class_='chiphell_box cl')[-1]
  div = div.find('div', class_='acon cl')
  # articles are in the ul div name = "threadulid"
  ul = div.find('ul', id='threadulid')
  # find all the li tags
  li_list = ul.find_all('li')
  # a list item is like:
  # <li>
  # <a href="article-30010-1.html" target="_blank" class="tm01 cl">
  # <img src="https://static.chiphell.com/portal/202307/16/080741btbodlblx8nwzn74.jpg">
  # </a>
  # <div class="tmpad cl">
  # <a href="article-30010-1.html" target="_blank" class="tm03 cl">一点小收藏—AP皇家橡树</a>
  # <div class="avart">
  # <a href="space-username-幼月.html" target="_blank" class="tmava"><img src="./data/avatar/000/27/86/96_avatar_small.jpg"></a>
  # <div class="avimain cl">
  # <a href="space-username-幼月.html" target="_blank" class="">幼月</a>
  # </div>
  # <div class="avimain2 cl">
  # <span style="padding-left: 0px;">2023/07/16</span>
  # <span class="avie">3231</span>
  # <span class="arep">48</span>
  # <a href="portal.php?mod=list&amp;catid=128" target="_blank" class="asort cl">腕表</a>
  # </div>
  # </div>
  # <div class="tm04 cl">
  # 又是我胡汉三，最近不知道怎么捅了腕表品牌的窝了。。三天两头给我塞东西。。所以作业按照混乱发！
  # 没有文笔，只有碎碎念。
  # ROYAL OAK 15551OR
  # 女王的第一块AP，最早许愿是蓝盘但是到的是白盘，不过白盘也非常美  ...</div>
  # </div>
  # </li>
  # get the article img, title, author, date, category, content and url
  for li in li_list:
    # get the article img
    img = li.find('img')
    img_url = img['src']
    # get the article title
    title = li.find('a', class_='tm03 cl')
    title = title.text
    # get the article author
    author = li.find('a', class_='')
    author = author.text
    # get the article date
    date = li.find('span', style='padding-left: 0px;')
    date = date.text
    # get the article category
    category = li.find('a', class_='asort cl')
    category = category.text
    # get the article content
    content = li.find('div', class_='tm04 cl')
    content = content.text
    # get the article url
    url = li.find('a', class_='tm03 cl')
    url = home_url + url['href']
    # get the article id
    article_id = re.findall(r'article-(\d+)-1.html', url)[0]
  for entry in feed.entries:
    # 标题
    title = entry.title
    # 链接
    url = entry.link
    # 发布时间
    date = entry.published
    # 作者
    author = entry.get('author', '未知')
    # 分类（第一个 tag）
    category = entry.tags[0]['term'] if 'tags' in entry and entry.tags else '未分类'
    # 简介内容
    content = entry.summary
    content.removesuffix('...')
-    # make the article info a dict
+    # 图片链接（从 links 中的 enclosure 找 image/jpeg）
    img_url = ''
    for link in entry.get('links', []):
      if link.get('type', '') == 'image/jpeg' and link.get('rel') == 'enclosure':
        img_url = link.get('href')
        img_url = img_url.replace('https://www.chiphell.com/', '')
        break
    # 提取 article_id（从链接中用正则）
    article_id_match = re.search(r'article-(\d+)-1\.html', url)
    article_id = article_id_match.group(1) if article_id_match else 'unknown'
    # 封装成字典
    article = {
      'img_url': img_url,
      'title': title,