diff --git a/crawler.py b/crawler.py
index 6899733..e51dbf7 100644
--- a/crawler.py
+++ b/crawler.py
@@ -6,6 +6,7 @@ from bs4 import BeautifulSoup
import re
import redis
import json
+import feedparser
from loguru import logger
from mastodon import Mastodon
@@ -46,75 +47,37 @@ def url_shorten(url):
return url
def crawler():
- # get article list in html div class name = "acon cl"
- home_url = 'https://www.chiphell.com/'
- # a normal chrome user agent
- headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 8_6_7) AppleWebKit/534.36 (KHTML, like Gecko) Chrome/51.0.2863.218 Safari/601'}
- # get the html page
- r = requests.get(home_url, headers=headers)
- # use BeautifulSoup to parse the html page
- soup = BeautifulSoup(r.text, features="html.parser")
- # find the div class name = "acon cl" in the last div name = "chiphell_box cl"
- div = soup.find_all('div', class_='chiphell_box cl')[-1]
- div = div.find('div', class_='acon cl')
- # articles are in the ul div name = "threadulid"
- ul = div.find('ul', id='threadulid')
- # find all the li tags
- li_list = ul.find_all('li')
- # a list item is like:
- #
- #
- #
- #
- #
- #
- #             一点小收藏—AP皇家橡树
- #
- #

- #
- #
- #
- #             2023/07/16
- #
- #             3231
- #
- #             48
- #
- #             腕表
- #
- #
- #
- # 又是我胡汉三,最近不知道怎么捅了腕表品牌的窝了。。三天两头给我塞东西。。所以作业按照混乱发!
- # 没有文笔,只有碎碎念。
- # ROYAL OAK 15551OR
- # 女王的第一块AP,最早许愿是蓝盘但是到的是白盘,不过白盘也非常美 ...
- #
- #
- # get the article img, title, author, date, category, content and url
- for li in li_list:
- # get the article img
- img = li.find('img')
- img_url = img['src']
- # get the article title
- title = li.find('a', class_='tm03 cl')
- title = title.text
- # get the article author
- author = li.find('a', class_='')
- author = author.text
- # get the article date
- date = li.find('span', style='padding-left: 0px;')
- date = date.text
- # get the article category
- category = li.find('a', class_='asort cl')
- category = category.text
- # get the article content
- content = li.find('div', class_='tm04 cl')
- content = content.text
- # get the article url
- url = li.find('a', class_='tm03 cl')
- url = home_url + url['href']
- # get the article id
- article_id = re.findall(r'article-(\d+)-1.html', url)[0]
+ rss_url = 'https://www.chiphell.com/portal.php?mod=rss'
+ feed = feedparser.parse(rss_url)
+ for entry in feed.entries:
+ # 标题
+ title = entry.title
+ # 链接
+ url = entry.link
+ # 发布时间
+ date = entry.published
+ # 作者
+ author = entry.get('author', '未知')
+ # 分类(第一个 tag)
+ category = entry.tags[0]['term'] if 'tags' in entry and entry.tags else '未分类'
+ # 简介内容
+ content = entry.summary
+ content.removesuffix('...')
- # make the article info a dict
+ # 图片链接(从 links 中的 enclosure 找 image/jpeg)
+ img_url = ''
+ for link in entry.get('links', []):
+ if link.get('type', '') == 'image/jpeg' and link.get('rel') == 'enclosure':
+ img_url = link.get('href')
+ img_url = img_url.replace('https://www.chiphell.com/', '')
+ break
+
+ # 提取 article_id(从链接中用正则)
+ article_id_match = re.search(r'article-(\d+)-1\.html', url)
+ article_id = article_id_match.group(1) if article_id_match else 'unknown'
+
+ # 封装成字典
article = {
'img_url': img_url,
'title': title,