diff --git a/crawler.py b/crawler.py
index 6899733..75914eb 100644
--- a/crawler.py
+++ b/crawler.py
@@ -6,6 +6,7 @@
 from bs4 import BeautifulSoup
 import re
 import redis
 import json
+import cloudscraper
 from loguru import logger
 from mastodon import Mastodon
@@ -25,6 +26,7 @@
 mastodon_client = Mastodon(
     api_base_url = 'https://nofan.xyz/'
 )
 
+scraper = cloudscraper.create_scraper()
 def save_to_redis(article):
     key = 'chh-article:%s' % article['article_id']
@@ -49,9 +51,10 @@ def crawler():
     # get article list in html div class name = "acon cl"
     home_url = 'https://www.chiphell.com/'
     # a normal chrome user agent
-    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 8_6_7) AppleWebKit/534.36 (KHTML, like Gecko) Chrome/51.0.2863.218 Safari/601'}
+    # headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 8_6_7) AppleWebKit/534.36 (KHTML, like Gecko) Chrome/51.0.2863.218 Safari/601'}
     # get the html page
-    r = requests.get(home_url, headers=headers)
+    # r = requests.get(home_url, headers=headers)
+    r = scraper.get(home_url)
     # use BeautifulSoup to parse the html page
     soup = BeautifulSoup(r.text, features="html.parser")
     # find the div class name = "acon cl" in the last div name = "chiphell_box cl"
@@ -128,7 +131,7 @@ def crawler():
 
         # save the article info to redis
         if save_to_redis(article):
-            print(article)
+            logger.info(article)
 
 def toot():
     # get all the keys in redis
@@ -142,9 +145,11 @@ def toot():
             continue
 
         # download article image to a temp file
-        img = requests.get(article['img_url'])
+        # img = requests.get(article['img_url'])
+        img = scraper.get(article['img_url'], timeout=10)
         # upload article image to mastodon
         media = mastodon_client.media_post(img.content, 'image/jpeg')
+        logger.info('Toot %s' % article['title'])
         # toot the article info
         toot_content = """{title} - #{category} by {author} \n {content} \n\n {url} \n\n #chh #chiphell \n """.format(
             title=article['title'],