From 15addaba24208210cce63ae1bd35c1f45282a69c Mon Sep 17 00:00:00 2001 From: Ching L Date: Mon, 7 Apr 2025 11:32:00 +0800 Subject: [PATCH] feat(crawler): update crawler to use RSS feed for article retrieval Replaced HTML scraping with RSS feed parsing to fetch article details including title, URL, author, date, category, content, and image link. This improves reliability and efficiency in gathering articles from the source. --- crawler.py | 97 +++++++++++++++++------------------------------------- 1 file changed, 30 insertions(+), 67 deletions(-) diff --git a/crawler.py b/crawler.py index 6899733..e51dbf7 100644 --- a/crawler.py +++ b/crawler.py @@ -6,6 +6,7 @@ from bs4 import BeautifulSoup import re import redis import json +import feedparser from loguru import logger from mastodon import Mastodon @@ -46,75 +47,37 @@ def url_shorten(url): return url def crawler(): - # get article list in html div class name = "acon cl" - home_url = 'https://www.chiphell.com/' - # a normal chrome user agent - headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 8_6_7) AppleWebKit/534.36 (KHTML, like Gecko) Chrome/51.0.2863.218 Safari/601'} - # get the html page - r = requests.get(home_url, headers=headers) - # use BeautifulSoup to parse the html page - soup = BeautifulSoup(r.text, features="html.parser") - # find the div class name = "acon cl" in the last div name = "chiphell_box cl" - div = soup.find_all('div', class_='chiphell_box cl')[-1] - div = div.find('div', class_='acon cl') - # articles are in the ul div name = "threadulid" - ul = div.find('ul', id='threadulid') - # find all the li tags - li_list = ul.find_all('li') - # a list item is like: - #
  • - # - # - # - #
    - # 一点小收藏—AP皇家橡树 - #
    - # - #
    - # 幼月 - #
    - #
    - # 2023/07/16 - # 3231 - # 48 - # 腕表 - #
    - #
    - #
    - # 又是我胡汉三,最近不知道怎么捅了腕表品牌的窝了。。三天两头给我塞东西。。所以作业按照混乱发! - # 没有文笔,只有碎碎念。 - # ROYAL OAK 15551OR - # 女王的第一块AP,最早许愿是蓝盘但是到的是白盘,不过白盘也非常美 ...
    - #
    - #
  • - # get the article img, title, author, date, category, content and url - for li in li_list: - # get the article img - img = li.find('img') - img_url = img['src'] - # get the article title - title = li.find('a', class_='tm03 cl') - title = title.text - # get the article author - author = li.find('a', class_='') - author = author.text - # get the article date - date = li.find('span', style='padding-left: 0px;') - date = date.text - # get the article category - category = li.find('a', class_='asort cl') - category = category.text - # get the article content - content = li.find('div', class_='tm04 cl') - content = content.text - # get the article url - url = li.find('a', class_='tm03 cl') - url = home_url + url['href'] - # get the article id - article_id = re.findall(r'article-(\d+)-1.html', url)[0] + rss_url = 'https://www.chiphell.com/portal.php?mod=rss' + feed = feedparser.parse(rss_url) + for entry in feed.entries: + # 标题 + title = entry.title + # 链接 + url = entry.link + # 发布时间 + date = entry.published + # 作者 + author = entry.get('author', '未知') + # 分类(第一个 tag) + category = entry.tags[0]['term'] if 'tags' in entry and entry.tags else '未分类' + # 简介内容 + content = entry.summary + content.removesuffix('...') - # make the article info a dict + # 图片链接(从 links 中的 enclosure 找 image/jpeg) + img_url = '' + for link in entry.get('links', []): + if link.get('type', '') == 'image/jpeg' and link.get('rel') == 'enclosure': + img_url = link.get('href') + img_url = img_url.replace('https://www.chiphell.com/', '') + break + + # 提取 article_id(从链接中用正则) + article_id_match = re.search(r'article-(\d+)-1\.html', url) + article_id = article_id_match.group(1) if article_id_match else 'unknown' + + # 封装成字典 article = { 'img_url': img_url, 'title': title,