feat(crawler): update crawler to use RSS feed for article retrieval

Replaced HTML scraping with RSS feed parsing to fetch article details including title, URL, author, date, category, content, and image link. This improves reliability and efficiency in gathering articles from the source.
This commit is contained in:
Ching L 2025-04-07 11:32:00 +08:00
parent c5bf60858c
commit 15addaba24

View File

@ -6,6 +6,7 @@ from bs4 import BeautifulSoup
import re
import redis
import json
import feedparser
from loguru import logger
from mastodon import Mastodon
@ -46,75 +47,37 @@ def url_shorten(url):
return url
def crawler():
# get article list in html div class name = "acon cl"
home_url = 'https://www.chiphell.com/'
# a normal chrome user agent
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 8_6_7) AppleWebKit/534.36 (KHTML, like Gecko) Chrome/51.0.2863.218 Safari/601'}
# get the html page
r = requests.get(home_url, headers=headers)
# use BeautifulSoup to parse the html page
soup = BeautifulSoup(r.text, features="html.parser")
# find the div class name = "acon cl" in the last div name = "chiphell_box cl"
div = soup.find_all('div', class_='chiphell_box cl')[-1]
div = div.find('div', class_='acon cl')
# articles are in the ul div name = "threadulid"
ul = div.find('ul', id='threadulid')
# find all the li tags
li_list = ul.find_all('li')
# a list item is like:
# <li>
# <a href="article-30010-1.html" target="_blank" class="tm01 cl">
# <img src="https://static.chiphell.com/portal/202307/16/080741btbodlblx8nwzn74.jpg">
# </a>
# <div class="tmpad cl">
# <a href="article-30010-1.html" target="_blank" class="tm03 cl">一点小收藏—AP皇家橡树</a>
# <div class="avart">
# <a href="space-username-幼月.html" target="_blank" class="tmava"><img src="./data/avatar/000/27/86/96_avatar_small.jpg"></a>
# <div class="avimain cl">
# <a href="space-username-幼月.html" target="_blank" class="">幼月</a>
# </div>
# <div class="avimain2 cl">
# <span style="padding-left: 0px;">2023/07/16</span>
# <span class="avie">3231</span>
# <span class="arep">48</span>
# <a href="portal.php?mod=list&amp;catid=128" target="_blank" class="asort cl">腕表</a>
# </div>
# </div>
# <div class="tm04 cl">
# 又是我胡汉三,最近不知道怎么捅了腕表品牌的窝了。。三天两头给我塞东西。。所以作业按照混乱发!
# 没有文笔,只有碎碎念。
# ROYAL OAK 15551OR
# 女王的第一块AP最早许愿是蓝盘但是到的是白盘不过白盘也非常美 ...</div>
# </div>
# </li>
# get the article img, title, author, date, category, content and url
for li in li_list:
# get the article img
img = li.find('img')
img_url = img['src']
# get the article title
title = li.find('a', class_='tm03 cl')
title = title.text
# get the article author
author = li.find('a', class_='')
author = author.text
# get the article date
date = li.find('span', style='padding-left: 0px;')
date = date.text
# get the article category
category = li.find('a', class_='asort cl')
category = category.text
# get the article content
content = li.find('div', class_='tm04 cl')
content = content.text
# get the article url
url = li.find('a', class_='tm03 cl')
url = home_url + url['href']
# get the article id
article_id = re.findall(r'article-(\d+)-1.html', url)[0]
rss_url = 'https://www.chiphell.com/portal.php?mod=rss'
feed = feedparser.parse(rss_url)
for entry in feed.entries:
# 标题
title = entry.title
# 链接
url = entry.link
# 发布时间
date = entry.published
# 作者
author = entry.get('author', '未知')
# 分类(第一个 tag)
category = entry.tags[0]['term'] if 'tags' in entry and entry.tags else '未分类'
# 简介内容
content = entry.summary
content.removesuffix('...')
# make the article info a dict
# 图片链接(从 links 中的 enclosure 找 image/jpeg)
img_url = ''
for link in entry.get('links', []):
if link.get('type', '') == 'image/jpeg' and link.get('rel') == 'enclosure':
img_url = link.get('href')
img_url = img_url.replace('https://www.chiphell.com/', '')
break
# 提取 article_id(从链接中用正则)
article_id_match = re.search(r'article-(\d+)-1\.html', url)
article_id = article_id_match.group(1) if article_id_match else 'unknown'
# 封装成字典
article = {
'img_url': img_url,
'title': title,