feat(crawler): 使用 cloudscraper 库进行网页爬取
使用 cloudscraper 库替换 requests 库，以便在网页爬取过程中绕过 Tencent 的防护机制。
This commit is contained in:
parent
c5bf60858c
commit
29a8a08622
13
crawler.py
13
crawler.py
@@ -6,6 +6,7 @@ from bs4 import BeautifulSoup
|
||||
import re
|
||||
import redis
|
||||
import json
|
||||
import cloudscraper
|
||||
from loguru import logger
|
||||
|
||||
from mastodon import Mastodon
|
||||
@@ -25,6 +26,7 @@ mastodon_client = Mastodon(
|
||||
api_base_url = 'https://nofan.xyz/'
|
||||
)
|
||||
|
||||
scraper = cloudscraper.create_scraper()
|
||||
|
||||
def save_to_redis(article):
|
||||
key = 'chh-article:%s' % article['article_id']
|
||||
@@ -49,9 +51,10 @@ def crawler():
|
||||
# get article list in html div class name = "acon cl"
|
||||
home_url = 'https://www.chiphell.com/'
|
||||
# a normal chrome user agent
|
||||
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 8_6_7) AppleWebKit/534.36 (KHTML, like Gecko) Chrome/51.0.2863.218 Safari/601'}
|
||||
# headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 8_6_7) AppleWebKit/534.36 (KHTML, like Gecko) Chrome/51.0.2863.218 Safari/601'}
|
||||
# get the html page
|
||||
r = requests.get(home_url, headers=headers)
|
||||
# r = requests.get(home_url, headers=headers)
|
||||
r = scraper.get(home_url)
|
||||
# use BeautifulSoup to parse the html page
|
||||
soup = BeautifulSoup(r.text, features="html.parser")
|
||||
# find the div class name = "acon cl" in the last div name = "chiphell_box cl"
|
||||
@@ -128,7 +131,7 @@ def crawler():
|
||||
|
||||
# save the article info to redis
|
||||
if save_to_redis(article):
|
||||
print(article)
|
||||
logger.info(article)
|
||||
|
||||
def toot():
|
||||
# get all the keys in redis
|
||||
@@ -142,9 +145,11 @@ def toot():
|
||||
continue
|
||||
|
||||
# download article image to a temp file
|
||||
img = requests.get(article['img_url'])
|
||||
# img = requests.get(article['img_url'])
|
||||
img = scraper.get(article['img_url'], timeout=10)
|
||||
# upload article image to mastodon
|
||||
media = mastodon_client.media_post(img.content, 'image/jpeg')
|
||||
logger.info('Toot %s' % article['title'])
|
||||
# toot the article info
|
||||
toot_content = """{title} - #{category} by {author} \n {content} \n\n {url} \n\n #chh #chiphell \n """.format(
|
||||
title=article['title'],
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user