diff --git a/crawler.py b/crawler.py
index 6899733..f341277 100644
--- a/crawler.py
+++ b/crawler.py
@@ -2,18 +2,27 @@
 import requests
-from bs4 import BeautifulSoup
 import re
 import redis
 import json
 from loguru import logger
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.chrome.options import Options
+from selenium.common.exceptions import TimeoutException, NoSuchElementException
+import time
 from mastodon import Mastodon
 
-# logging.basicConfig(filename='/root/develop/log/chh-craler.log', level=logging.INFO)
-# logger = logging.getLogger('/root/develop/log/chh-craler.log')
-logger.add('/root/develop/log/chh-craler.log', level='INFO')
+# Configure logger - use local path for macOS
+import os
+log_dir = './logs'
+if not os.path.exists(log_dir):
+    os.makedirs(log_dir)
+logger.add('./logs/chh-crawler.log', level='INFO')
 
 # connect to redis with password
@@ -45,22 +54,53 @@ def url_shorten(url):
     else:
         return url
 
+def setup_driver():
+    """Configure and initialize Chrome WebDriver"""
+    chrome_options = Options()
+    chrome_options.add_argument('--headless')  # Run in headless mode
+    chrome_options.add_argument('--disable-gpu')
+    chrome_options.add_argument('--no-sandbox')
+    chrome_options.add_argument('--disable-dev-shm-usage')
+    chrome_options.add_argument('--disable-blink-features=AutomationControlled')
+    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
+    chrome_options.add_experimental_option('useAutomationExtension', False)
+
+    # Set user agent
+    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+    chrome_options.add_argument(f'user-agent={user_agent}')
+    chrome_options.add_argument('--window-size=1920,1080')
+
+    driver = webdriver.Chrome(options=chrome_options)
+    return driver
+
 def crawler():
-    # get article list in html div class name = "acon cl"
+    # Initialize Selenium WebDriver
     home_url = 'https://www.chiphell.com/'
-    # a normal chrome user agent
-    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 8_6_7) AppleWebKit/534.36 (KHTML, like Gecko) Chrome/51.0.2863.218 Safari/601'}
-    # get the html page
-    r = requests.get(home_url, headers=headers)
-    # use BeautifulSoup to parse the html page
-    soup = BeautifulSoup(r.text, features="html.parser")
-    # find the div class name = "acon cl" in the last div name = "chiphell_box cl"
-    div = soup.find_all('div', class_='chiphell_box cl')[-1]
-    div = div.find('div', class_='acon cl')
-    # articles are in the ul div name = "threadulid"
-    ul = div.find('ul', id='threadulid')
-    # find all the li tags
-    li_list = ul.find_all('li')
+    driver = setup_driver()
+
+    try:
+        logger.info(f"Starting to crawl {home_url}")
+        driver.get(home_url)
+
+        # Wait for page to load
+        time.sleep(3)
+
+        # Find all chiphell_box elements and get the last one
+        boxes = driver.find_elements(By.CLASS_NAME, 'chiphell_box')
+        if not boxes:
+            logger.error("No chiphell_box elements found")
+            return
+
+        last_box = boxes[-1]
+
+        # Find the acon div within the last box
+        acon_div = last_box.find_element(By.CLASS_NAME, 'acon')
+
+        # Find the ul with id='threadulid'
+        ul = acon_div.find_element(By.ID, 'threadulid')
+
+        # Find all li elements
+        li_list = ul.find_elements(By.TAG_NAME, 'li')
     # a list item is like:
     #