feat(crawler): update crawler to use RSS feed for article retrieval
Replaced HTML scraping with RSS feed parsing to fetch article details including title, URL, author, date, category, content, and image link. This improves reliability and efficiency in gathering articles from the source.
This commit is contained in:
parent
c5bf60858c
commit
15addaba24
97
crawler.py
97
crawler.py
@ -6,6 +6,7 @@ from bs4 import BeautifulSoup
|
|||||||
import re
|
import re
|
||||||
import redis
|
import redis
|
||||||
import json
|
import json
|
||||||
|
import feedparser
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
from mastodon import Mastodon
|
from mastodon import Mastodon
|
||||||
@ -46,75 +47,37 @@ def url_shorten(url):
|
|||||||
return url
|
return url
|
||||||
|
|
||||||
def crawler():
|
def crawler():
|
||||||
# get article list in html div class name = "acon cl"
|
rss_url = 'https://www.chiphell.com/portal.php?mod=rss'
|
||||||
home_url = 'https://www.chiphell.com/'
|
feed = feedparser.parse(rss_url)
|
||||||
# a normal chrome user agent
|
|
||||||
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 8_6_7) AppleWebKit/534.36 (KHTML, like Gecko) Chrome/51.0.2863.218 Safari/601'}
|
|
||||||
# get the html page
|
|
||||||
r = requests.get(home_url, headers=headers)
|
|
||||||
# use BeautifulSoup to parse the html page
|
|
||||||
soup = BeautifulSoup(r.text, features="html.parser")
|
|
||||||
# find the div class name = "acon cl" in the last div name = "chiphell_box cl"
|
|
||||||
div = soup.find_all('div', class_='chiphell_box cl')[-1]
|
|
||||||
div = div.find('div', class_='acon cl')
|
|
||||||
# articles are in the ul div name = "threadulid"
|
|
||||||
ul = div.find('ul', id='threadulid')
|
|
||||||
# find all the li tags
|
|
||||||
li_list = ul.find_all('li')
|
|
||||||
# a list item is like:
|
|
||||||
# <li>
|
|
||||||
# <a href="article-30010-1.html" target="_blank" class="tm01 cl">
|
|
||||||
# <img src="https://static.chiphell.com/portal/202307/16/080741btbodlblx8nwzn74.jpg">
|
|
||||||
# </a>
|
|
||||||
# <div class="tmpad cl">
|
|
||||||
# <a href="article-30010-1.html" target="_blank" class="tm03 cl">一点小收藏—AP皇家橡树</a>
|
|
||||||
# <div class="avart">
|
|
||||||
# <a href="space-username-幼月.html" target="_blank" class="tmava"><img src="./data/avatar/000/27/86/96_avatar_small.jpg"></a>
|
|
||||||
# <div class="avimain cl">
|
|
||||||
# <a href="space-username-幼月.html" target="_blank" class="">幼月</a>
|
|
||||||
# </div>
|
|
||||||
# <div class="avimain2 cl">
|
|
||||||
# <span style="padding-left: 0px;">2023/07/16</span>
|
|
||||||
# <span class="avie">3231</span>
|
|
||||||
# <span class="arep">48</span>
|
|
||||||
# <a href="portal.php?mod=list&catid=128" target="_blank" class="asort cl">腕表</a>
|
|
||||||
# </div>
|
|
||||||
# </div>
|
|
||||||
# <div class="tm04 cl">
|
|
||||||
# 又是我胡汉三,最近不知道怎么捅了腕表品牌的窝了。。三天两头给我塞东西。。所以作业按照混乱发!
|
|
||||||
# 没有文笔,只有碎碎念。
|
|
||||||
# ROYAL OAK 15551OR
|
|
||||||
# 女王的第一块AP,最早许愿是蓝盘但是到的是白盘,不过白盘也非常美 ...</div>
|
|
||||||
# </div>
|
|
||||||
# </li>
|
|
||||||
# get the article img, title, author, date, category, content and url
|
|
||||||
for li in li_list:
|
|
||||||
# get the article img
|
|
||||||
img = li.find('img')
|
|
||||||
img_url = img['src']
|
|
||||||
# get the article title
|
|
||||||
title = li.find('a', class_='tm03 cl')
|
|
||||||
title = title.text
|
|
||||||
# get the article author
|
|
||||||
author = li.find('a', class_='')
|
|
||||||
author = author.text
|
|
||||||
# get the article date
|
|
||||||
date = li.find('span', style='padding-left: 0px;')
|
|
||||||
date = date.text
|
|
||||||
# get the article category
|
|
||||||
category = li.find('a', class_='asort cl')
|
|
||||||
category = category.text
|
|
||||||
# get the article content
|
|
||||||
content = li.find('div', class_='tm04 cl')
|
|
||||||
content = content.text
|
|
||||||
# get the article url
|
|
||||||
url = li.find('a', class_='tm03 cl')
|
|
||||||
url = home_url + url['href']
|
|
||||||
# get the article id
|
|
||||||
article_id = re.findall(r'article-(\d+)-1.html', url)[0]
|
|
||||||
|
|
||||||
|
for entry in feed.entries:
|
||||||
|
# 标题
|
||||||
|
title = entry.title
|
||||||
|
# 链接
|
||||||
|
url = entry.link
|
||||||
|
# 发布时间
|
||||||
|
date = entry.published
|
||||||
|
# 作者
|
||||||
|
author = entry.get('author', '未知')
|
||||||
|
# 分类(第一个 tag)
|
||||||
|
category = entry.tags[0]['term'] if 'tags' in entry and entry.tags else '未分类'
|
||||||
|
# 简介内容
|
||||||
|
content = entry.summary
|
||||||
|
content.removesuffix('...')
|
||||||
|
|
||||||
# make the article info a dict
|
# 图片链接(从 links 中的 enclosure 找 image/jpeg)
|
||||||
|
img_url = ''
|
||||||
|
for link in entry.get('links', []):
|
||||||
|
if link.get('type', '') == 'image/jpeg' and link.get('rel') == 'enclosure':
|
||||||
|
img_url = link.get('href')
|
||||||
|
img_url = img_url.replace('https://www.chiphell.com/', '')
|
||||||
|
break
|
||||||
|
|
||||||
|
# 提取 article_id(从链接中用正则)
|
||||||
|
article_id_match = re.search(r'article-(\d+)-1\.html', url)
|
||||||
|
article_id = article_id_match.group(1) if article_id_match else 'unknown'
|
||||||
|
|
||||||
|
# 封装成字典
|
||||||
article = {
|
article = {
|
||||||
'img_url': img_url,
|
'img_url': img_url,
|
||||||
'title': title,
|
'title': title,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user