feat(crawler): 使用 cloudscraper 库进行网页爬取
使用 cloudscraper 库替换 requests 库，以便在网页爬取过程中绕过 Tencent 的防护机制。
This commit is contained in:
parent
c5bf60858c
commit
29a8a08622
13
crawler.py
13
crawler.py
@ -6,6 +6,7 @@ from bs4 import BeautifulSoup
|
|||||||
import re
|
import re
|
||||||
import redis
|
import redis
|
||||||
import json
|
import json
|
||||||
|
import cloudscraper
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
from mastodon import Mastodon
|
from mastodon import Mastodon
|
||||||
@ -25,6 +26,7 @@ mastodon_client = Mastodon(
|
|||||||
api_base_url = 'https://nofan.xyz/'
|
api_base_url = 'https://nofan.xyz/'
|
||||||
)
|
)
|
||||||
|
|
||||||
|
scraper = cloudscraper.create_scraper()
|
||||||
|
|
||||||
def save_to_redis(article):
|
def save_to_redis(article):
|
||||||
key = 'chh-article:%s' % article['article_id']
|
key = 'chh-article:%s' % article['article_id']
|
||||||
@ -49,9 +51,10 @@ def crawler():
|
|||||||
# get article list in html div class name = "acon cl"
|
# get article list in html div class name = "acon cl"
|
||||||
home_url = 'https://www.chiphell.com/'
|
home_url = 'https://www.chiphell.com/'
|
||||||
# a normal chrome user agent
|
# a normal chrome user agent
|
||||||
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 8_6_7) AppleWebKit/534.36 (KHTML, like Gecko) Chrome/51.0.2863.218 Safari/601'}
|
# headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 8_6_7) AppleWebKit/534.36 (KHTML, like Gecko) Chrome/51.0.2863.218 Safari/601'}
|
||||||
# get the html page
|
# get the html page
|
||||||
r = requests.get(home_url, headers=headers)
|
# r = requests.get(home_url, headers=headers)
|
||||||
|
r = scraper.get(home_url)
|
||||||
# use BeautifulSoup to parse the html page
|
# use BeautifulSoup to parse the html page
|
||||||
soup = BeautifulSoup(r.text, features="html.parser")
|
soup = BeautifulSoup(r.text, features="html.parser")
|
||||||
# find the div class name = "acon cl" in the last div name = "chiphell_box cl"
|
# find the div class name = "acon cl" in the last div name = "chiphell_box cl"
|
||||||
@ -128,7 +131,7 @@ def crawler():
|
|||||||
|
|
||||||
# save the article info to redis
|
# save the article info to redis
|
||||||
if save_to_redis(article):
|
if save_to_redis(article):
|
||||||
print(article)
|
logger.info(article)
|
||||||
|
|
||||||
def toot():
|
def toot():
|
||||||
# get all the keys in redis
|
# get all the keys in redis
|
||||||
@ -142,9 +145,11 @@ def toot():
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
# download article image to a temp file
|
# download article image to a temp file
|
||||||
img = requests.get(article['img_url'])
|
# img = requests.get(article['img_url'])
|
||||||
|
img = scraper.get(article['img_url'], timeout=10)
|
||||||
# upload article image to mastodon
|
# upload article image to mastodon
|
||||||
media = mastodon_client.media_post(img.content, 'image/jpeg')
|
media = mastodon_client.media_post(img.content, 'image/jpeg')
|
||||||
|
logger.info('Toot %s' % article['title'])
|
||||||
# toot the article info
|
# toot the article info
|
||||||
toot_content = """{title} - #{category} by {author} \n {content} \n\n {url} \n\n #chh #chiphell \n """.format(
|
toot_content = """{title} - #{category} by {author} \n {content} \n\n {url} \n\n #chh #chiphell \n """.format(
|
||||||
title=article['title'],
|
title=article['title'],
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user