feat(crawler): 使用 cloudscraper 库进行网页爬取

使用 cloudscraper 库替换 requests 库，以便在网页爬取过程中绕过 Tencent 的防护机制。
2025-04-07 17:46:21 +08:00
1 changed files with 75 additions and 69 deletions
--- a/crawler.py
+++ b/crawler.py
@ -6,7 +6,6 @@ from bs4 import BeautifulSoup
 import re
 import redis
 import json
-import feedparser
 import cloudscraper
 from loguru import logger

@ -15,8 +14,8 @@ from mastodon import Mastodon

 # logging.basicConfig(filename='/root/develop/log/chh-craler.log', level=logging.INFO)
 # logger = logging.getLogger('/root/develop/log/chh-craler.log')
-logger.add('/home/ching/logs/chh-craler.log', level='INFO')
-scraper = cloudscraper.create_scraper()
+logger.add('/root/develop/log/chh-craler.log', level='INFO')
+

 # connect to redis with password
 redis_db = redis.StrictRedis(host="localhost",
@ -27,30 +26,7 @@ mastodon_client = Mastodon(
  api_base_url = 'https://nofan.xyz/'
 )

-# 主分类和下级分类的映射关系
-CATEGORY_MAPPING = {
-    '照片': ['人物肖像', '风景游记', '微距静物', '人文扫街', '动物植物', '其它作品'],
-    '电脑': ['配件开箱', '整机搭建', '升级改造', '桌面书房', 'QNAP专区'],
-    '掌设': ['智能手机', '智能穿戴', '笔电平板', '周边附件', 'ThinkPad专区'],
-    '摄影': ['微单卡片', '单反单电', '经典旁轴', '怀旧菲林', '影音摄像', '周边附件'],
-    '汽车': ['买菜车', '商务车', '性能车', '旅行车', 'SUV', 'MPV', '摩托轻骑', '改装配件', 'CHH Auto Club'],
-    '单车': ['山地车', '公路车', '折叠车', '休旅车'],
-    '模型': ['人偶手办', '比例成品', '拼装自组', 'RC遥控'],
-    '败家': ['收藏爱好', '品质生活', '数码前沿', '小资情调', '女王最大', '吃喝玩乐', '育儿分享'],
-    '时尚': ['鞋类', '服饰', '箱包'],
-    '腕表': ['机械表', '电子表'],
-    '视听': ['耳机耳放', '音箱功放', '解码转盘', '随身设备', '唱片录音'],
-    '美食': ['当地美食', '世界美食', '私房菜品', '美食器材']
-}
-
-def get_main_category(sub_category):
-    """根据下级分类获取主分类"""
-    for main_category, sub_categories in CATEGORY_MAPPING.items():
-        if sub_category in sub_categories:
-            return main_category
-    # 如果不是下级分类或者是主分类本身，返回 None
-    return None
-
+scraper = cloudscraper.create_scraper()

 def save_to_redis(article):
  key = 'chh-article:%s' % article['article_id']
@ -72,46 +48,82 @@ def url_shorten(url):
    return url

 def crawler():
-  rss_url = 'https://www.chiphell.com/portal.php?mod=rss'
-  feed = feedparser.parse(rss_url)
+  # get article list in html div class name = "acon cl"
+  home_url = 'https://www.chiphell.com/'
+  # a normal chrome user agent
+  # headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 8_6_7) AppleWebKit/534.36 (KHTML, like Gecko) Chrome/51.0.2863.218 Safari/601'}
+  # get the html page
+  # r = requests.get(home_url, headers=headers)
+  r = scraper.get(home_url)
+  # use BeautifulSoup to parse the html page
+  soup = BeautifulSoup(r.text, features="html.parser")
+  # find the div with class "acon cl" inside the last div with class "chiphell_box cl"
+  div = soup.find_all('div', class_='chiphell_box cl')[-1]
+  div = div.find('div', class_='acon cl')
+  # articles are in the <ul> element with id="threadulid"
+  ul = div.find('ul', id='threadulid')
+  # find all the li tags
+  li_list = ul.find_all('li')
+  # a list item is like:
+  # <li>
+  # <a href="article-30010-1.html" target="_blank" class="tm01 cl">
+  # <img src="https://static.chiphell.com/portal/202307/16/080741btbodlblx8nwzn74.jpg">
+  # </a>
+  # <div class="tmpad cl">
+  # <a href="article-30010-1.html" target="_blank" class="tm03 cl">一点小收藏—AP皇家橡树</a>
+  # <div class="avart">
+  # <a href="space-username-幼月.html" target="_blank" class="tmava"><img src="./data/avatar/000/27/86/96_avatar_small.jpg"></a>
+  # <div class="avimain cl">
+  # <a href="space-username-幼月.html" target="_blank" class="">幼月</a>
+  # </div>
+  # <div class="avimain2 cl">
+  # <span style="padding-left: 0px;">2023/07/16</span>
+  # <span class="avie">3231</span>
+  # <span class="arep">48</span>
+  # <a href="portal.php?mod=list&amp;catid=128" target="_blank" class="asort cl">腕表</a>
+  # </div>
+  # </div>
+  # <div class="tm04 cl">
+  # 又是我胡汉三，最近不知道怎么捅了腕表品牌的窝了。。三天两头给我塞东西。。所以作业按照混乱发！
+  # 没有文笔，只有碎碎念。
+  # ROYAL OAK 15551OR
+  # 女王的第一块AP，最早许愿是蓝盘但是到的是白盘，不过白盘也非常美  ...</div>
+  # </div>
+  # </li>
+  # get the article img, title, author, date, category, content and url
+  for li in li_list:
+    # get the article img
+    img = li.find('img')
+    img_url = img['src']
+    # get the article title
+    title = li.find('a', class_='tm03 cl')
+    title = title.text
+    # get the article author
+    author = li.find('a', class_='')
+    author = author.text
+    # get the article date
+    date = li.find('span', style='padding-left: 0px;')
+    date = date.text
+    # get the article category
+    category = li.find('a', class_='asort cl')
+    category = category.text
+    # get the article content
+    content = li.find('div', class_='tm04 cl')
+    content = content.text
+    # get the article url
+    url = li.find('a', class_='tm03 cl')
+    url = home_url + url['href']
+    # get the article id
+    article_id = re.findall(r'article-(\d+)-1.html', url)[0]

-  for entry in feed.entries:
-    # 标题
-    title = entry.title
-    # 链接
-    url = entry.link
-    # 发布时间
-    date = entry.published
-    # 作者
-    author = entry.get('author', '未知')
-    # 分类（第一个 tag）
-    category = entry.tags[0]['term'] if 'tags' in entry and entry.tags else '未分类'
-    # 获取主分类
-    main_category = get_main_category(category)
-    # 简介内容
-    content = entry.summary
-    content.removesuffix('...')

-    # 图片链接（从 links 中的 enclosure 找 image/jpeg）
-    img_url = ''
-    for link in entry.get('links', []):
-      if link.get('type', '') == 'image/jpeg' and link.get('rel') == 'enclosure':
-        img_url = link.get('href')
-        img_url = img_url.replace('https://www.chiphell.com/', '')
-        break
-
-    # 提取 article_id（从链接中用正则）
-    article_id_match = re.search(r'article-(\d+)-1\.html', url)
-    article_id = article_id_match.group(1) if article_id_match else 'unknown'
-
-    # 封装成字典
+    # make the article info a dict
    article = {
      'img_url': img_url,
      'title': title,
      'author': author,
      'date': date,
      'category': category,
-      'main_category': main_category,
      'content': content,
      'url': url,
      'article_id': article_id
@ -133,26 +145,20 @@ def toot():
      continue

    # download article image to a temp file
-    #img = requests.get(article['img_url'])
+    # img = requests.get(article['img_url'])
    img = scraper.get(article['img_url'], timeout=10)
    # upload article image to mastodon
    media = mastodon_client.media_post(img.content, 'image/jpeg')
-    # 构建分类标签
-    if article.get('main_category'):
-      category_tags = f"#{article['main_category']} #{article['category']}"
-    else:
-      category_tags = f"#{article['category']}"
-
+    logger.info('Toot %s' % article['title'])
    # toot the article info
-    toot_content = """{title} - {category_tags} by {author} \n {content} \n\n {url} \n\n #chh #chiphell \n """.format(
+    toot_content = """{title} - #{category} by {author} \n {content} \n\n {url} \n\n #chh #chiphell \n """.format(
      title=article['title'],
-      category_tags=category_tags,
+      category=article['category'],
      author=article['author'],
      content=article['content'],
      url=article['url'])

    mastodon_client.status_post(toot_content, media_ids=[media['id']])
-    logger.info('Toot %s' % article['title'])

    # add the article['id'] to the set
    redis_db.sadd('send-chh-article-id', article['article_id'])