diff --git a/crawler.py b/crawler.py index 6899733..e51dbf7 100644 --- a/crawler.py +++ b/crawler.py @@ -6,6 +6,7 @@ from bs4 import BeautifulSoup import re import redis import json +import feedparser from loguru import logger from mastodon import Mastodon @@ -46,75 +47,37 @@ def url_shorten(url): return url def crawler(): - # get article list in html div class name = "acon cl" - home_url = 'https://www.chiphell.com/' - # a normal chrome user agent - headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 8_6_7) AppleWebKit/534.36 (KHTML, like Gecko) Chrome/51.0.2863.218 Safari/601'} - # get the html page - r = requests.get(home_url, headers=headers) - # use BeautifulSoup to parse the html page - soup = BeautifulSoup(r.text, features="html.parser") - # find the div class name = "acon cl" in the last div name = "chiphell_box cl" - div = soup.find_all('div', class_='chiphell_box cl')[-1] - div = div.find('div', class_='acon cl') - # articles are in the ul div name = "threadulid" - ul = div.find('ul', id='threadulid') - # find all the li tags - li_list = ul.find_all('li') - # a list item is like: - #
  • - # - # - # - #
    - # 一点小收藏—AP皇家橡树 - #
    - # - #
    - # 幼月 - #
    - #
    - # 2023/07/16 - # 3231 - # 48 - # 腕表 - #
    - #
    - #
    - # 又是我胡汉三,最近不知道怎么捅了腕表品牌的窝了。。三天两头给我塞东西。。所以作业按照混乱发! - # 没有文笔,只有碎碎念。 - # ROYAL OAK 15551OR - # 女王的第一块AP,最早许愿是蓝盘但是到的是白盘,不过白盘也非常美 ...
    - #
    - #
  • - # get the article img, title, author, date, category, content and url - for li in li_list: - # get the article img - img = li.find('img') - img_url = img['src'] - # get the article title - title = li.find('a', class_='tm03 cl') - title = title.text - # get the article author - author = li.find('a', class_='') - author = author.text - # get the article date - date = li.find('span', style='padding-left: 0px;') - date = date.text - # get the article category - category = li.find('a', class_='asort cl') - category = category.text - # get the article content - content = li.find('div', class_='tm04 cl') - content = content.text - # get the article url - url = li.find('a', class_='tm03 cl') - url = home_url + url['href'] - # get the article id - article_id = re.findall(r'article-(\d+)-1.html', url)[0] + rss_url = 'https://www.chiphell.com/portal.php?mod=rss' + feed = feedparser.parse(rss_url) + for entry in feed.entries: + # 标题 + title = entry.title + # 链接 + url = entry.link + # 发布时间 + date = entry.published + # 作者 + author = entry.get('author', '未知') + # 分类(第一个 tag) + category = entry.tags[0]['term'] if 'tags' in entry and entry.tags else '未分类' + # 简介内容 + content = entry.summary + content = content.removesuffix('...') - # make the article info a dict + # 图片链接(从 links 中的 enclosure 找 image/jpeg) + img_url = '' + for link in entry.get('links', []): + if link.get('type', '') == 'image/jpeg' and link.get('rel') == 'enclosure': + img_url = link.get('href') + img_url = img_url.replace('https://www.chiphell.com/', '') + break + + # 提取 article_id(从链接中用正则) + article_id_match = re.search(r'article-(\d+)-1\.html', url) + article_id = article_id_match.group(1) if article_id_match else 'unknown' + + # 封装成字典 article = { 'img_url': img_url, 'title': title,