dota2-match-calendar/legacy/sync_dota2_matches.py

#!/usr/bin/env python3
"""
Dota 2 Tournament Calendar Sync
Fetches Tier 1 Dota 2 matches from Liquipedia and syncs them to Google Calendar
"""

import argparse
import hashlib
import re
import sys
from datetime import datetime, timedelta, timezone

import pytz
import requests
from bs4 import BeautifulSoup
from google.oauth2 import service_account
from googleapiclient.discovery import build

class Dota2CalendarSync:
    def __init__(self, credentials_file='credentials.json', calendar_id='primary'):
        """Remember the connection settings and open the Calendar API client.

        Args:
            credentials_file: Path to the service account JSON key file.
            calendar_id: ID of the calendar that events are read from and
                written to.
        """
        self.credentials_file, self.calendar_id = credentials_file, calendar_id
        # _authenticate() reads self.credentials_file, so it has to run last.
        self.service = self._authenticate()

    def _authenticate(self):
        """Build a Google Calendar API client from the service account key.

        Returns:
            The Calendar v3 service object produced by ``build``.

        If the credentials cannot be loaded or the client cannot be built,
        the error is printed and the process exits with status 1.
        """
        calendar_scopes = ['https://www.googleapis.com/auth/calendar']
        try:
            creds = service_account.Credentials.from_service_account_file(
                self.credentials_file, scopes=calendar_scopes
            )
            client = build('calendar', 'v3', credentials=creds)
            print(f"✓ Successfully authenticated with Google Calendar")
            return client
        except Exception as err:
            print(f"✗ Authentication failed: {err}")
            sys.exit(1)

    def fetch_liquipedia_matches(self):
        """Download the Liquipedia match list and return its Tier 1 matches.

        Every ``<span>`` carrying a ``data-timestamp`` attribute is inspected.
        When the text of its closest enclosing ``<div>`` mentions one of the
        Tier 1 keywords, the pair is handed to
        ``_parse_match_from_timestamp_element``.

        Returns:
            A list of parsed match dicts, de-duplicated by their ``'id'``
            (the last match seen for an ID wins). An empty list is returned
            when the HTTP request fails.
        """
        # Substrings that mark a block of text as belonging to a Tier 1 event
        # (TI / The International, Majors, Premier events, ...).
        tier1_keywords = [
            'TI2025', 'The International', 'Major', 'Premier',
            'Tier 1', 'DreamLeague', 'ESL One', 'PGL Major'
        ]
        page_url = 'https://liquipedia.net/dota2/Liquipedia:Matches'
        request_headers = {
            'User-Agent': 'Dota2CalendarSync/1.0 (https://github.com/youruser/dota2-calendar)'
        }

        print(f"Fetching matches from Liquipedia...")

        try:
            page = requests.get(page_url, headers=request_headers, timeout=30)
            page.raise_for_status()

            document = BeautifulSoup(page.text, 'lxml')

            # Keyed by match ID so that repeated listings collapse into one.
            by_id = {}
            for stamp in document.find_all('span', {'data-timestamp': True}):
                # The enclosing div holds the full match info
                container = stamp.find_parent('div')
                if not container:
                    continue

                container_text = container.get_text()
                if not any(keyword in container_text for keyword in tier1_keywords):
                    continue

                parsed = self._parse_match_from_timestamp_element(container, stamp)
                if parsed and parsed.get('id'):
                    by_id[parsed['id']] = parsed

            found = list(by_id.values())
            print(f"✓ Found {len(found)} Tier 1 matches")
            return found

        except requests.RequestException as err:
            print(f"✗ Error fetching Liquipedia data: {err}")
            return []

    def _parse_match_from_timestamp_element(self, parent, timestamp_elem):
        """Parse match data from an element containing a timestamp"""
        try:
            match_data = {}

            # Get timestamp
            timestamp = timestamp_elem.get('data-timestamp')
            if timestamp:
                match_data['datetime'] = datetime.fromtimestamp(int(timestamp), tz=pytz.UTC)
            else:
                return None

            # Parse the text content
            text = parent.get_text()

            # Extract teams and format using improved regex
            # The format from Liquipedia is like: "XGvs(Bo3)FalconsTI2025"
            vs_pattern = r'([A-Za-z0-9\s\.\-_]+?)vs\(?(Bo\d)\)?([A-Za-z0-9\s\.\-_]+?)(?:TI2025|Round|Playoff|Group|\+|$)'
            match = re.search(vs_pattern, text)

            if not match:
                # Try alternative pattern without format
                vs_pattern = r'([A-Za-z0-9\s\.\-_]+?)vs([A-Za-z0-9\s\.\-_]+?)(?:TI2025|Round|Playoff|Group|\+|$)'
                match = re.search(vs_pattern, text)

            if match:
                team1 = match.group(1).strip()
                if len(match.groups()) > 2:
                    format_str = match.group(2)
                    team2 = match.group(3).strip()
                else:
                    format_str = None
                    team2 = match.group(2).strip()

                # Clean up team names - remove any date/time remnants
                team1 = re.sub(r'^.*CEST?', '', team1).strip()

                if team1 and team2:
                    match_data['team1'] = self._clean_team_name(team1)
                    match_data['team2'] = self._clean_team_name(team2)

                    if format_str and format_str.startswith('Bo'):
                        match_data['format'] = format_str

            # Extract tournament
            # Look for TI2025 or other tournament indicators
            if 'TI2025' in text:
                match_data['tournament'] = 'The International 2025'
                # Also extract round info if present
                round_match = re.search(r'Round\s+\d+', text)
                if round_match:
                    match_data['tournament'] += f" - {round_match.group(0)}"
            elif 'DreamLeague' in text:
                match_data['tournament'] = 'DreamLeague'
            elif 'ESL' in text:
                match_data['tournament'] = 'ESL'
            elif 'Major' in text:
                # Try to extract full major name
                major_match = re.search(r'[\w\s]+Major', text)
                if major_match:
                    match_data['tournament'] = major_match.group(0).strip()

            # Only return if we have valid teams
            if 'team1' in match_data and 'team2' in match_data:
                match_data['id'] = self._generate_match_id(match_data)
                return match_data

        except Exception as e:
            pass

        return None

    def _extract_match_from_infobox(self, box):
        """Build a match dict from an infobox-style element.

        Reads the first two team spans, the first ``/dota2/`` link (used as
        the tournament name), the ``timer-object`` timestamp and the first
        ``BoN`` text node found inside ``box``.

        Returns:
            The match dict, including a generated ``'id'``, when both teams
            were found; otherwise ``None``. Any exception raised while
            reading the element is swallowed and also yields ``None``.
        """
        team_class = re.compile(r'team-template|team-name')
        dota_href = re.compile(r'/dota2/[^#]+')
        best_of = re.compile(r'Bo\d')

        try:
            info = {}

            # Teams: the first two matching spans, in document order.
            spans = box.find_all('span', {'class': team_class})
            if len(spans) >= 2:
                first, second = spans[0], spans[1]
                info['team1'] = self._clean_team_name(first.get_text())
                info['team2'] = self._clean_team_name(second.get_text())

            # Tournament: text of the first /dota2/ link.
            link = box.find('a', href=dota_href)
            if link:
                info['tournament'] = link.get_text().strip()

            # Start time: epoch seconds stored on the countdown timer.
            countdown = box.find('span', {'class': 'timer-object', 'data-timestamp': True})
            if countdown:
                epoch = countdown.get('data-timestamp')
                info['datetime'] = datetime.fromtimestamp(int(epoch), tz=pytz.UTC)

            # Series format: first text node containing "Bo<digit>".
            series = box.find(string=best_of)
            if series:
                info['format'] = series.strip()

            if 'team1' in info and 'team2' in info:
                info['id'] = self._generate_match_id(info)
                return info

        except Exception:
            pass

        return None

    def _extract_match_from_row(self, row):
        """Build a match dict from a table row.

        The row needs at least two ``<td>`` cells. The start time is read
        from a ``timer-object`` span in the first cell, the teams from the
        first ``team`` span of each cell (first two hits), and the tournament
        from the first ``/dota2/`` link whose href does not contain ``team``.

        Returns:
            The match dict, including a generated ``'id'``, when both teams
            were found; otherwise ``None``. Any exception raised while
            reading the row is swallowed and also yields ``None``.
        """
        team_class = re.compile(r'team')
        dota_href = re.compile(r'/dota2/[^#]+')

        try:
            cells = row.find_all('td')
            if len(cells) < 2:
                return None

            info = {}

            # Date/time lives in the first cell, when present.
            leading = cells[0]
            if leading:
                countdown = leading.find('span', {'class': 'timer-object', 'data-timestamp': True})
                if countdown:
                    epoch = countdown.get('data-timestamp')
                    info['datetime'] = datetime.fromtimestamp(int(epoch), tz=pytz.UTC)

            # One team element per cell at most; the first two are the teams.
            candidates = (cell.find('span', {'class': team_class}) for cell in cells)
            team_elems = [elem for elem in candidates if elem]
            if len(team_elems) >= 2:
                info['team1'] = self._clean_team_name(team_elems[0].get_text())
                info['team2'] = self._clean_team_name(team_elems[1].get_text())

            # Tournament: first non-team /dota2/ link in any cell.
            for cell in cells:
                anchor = cell.find('a', href=dota_href)
                if not anchor or 'team' in anchor.get('href', ''):
                    continue
                info['tournament'] = anchor.get_text().strip()
                break

            if 'team1' in info and 'team2' in info:
                info['id'] = self._generate_match_id(info)
                return info

        except Exception:
            pass

        return None

    def _extract_match_with_timer(self, parent, timer):
        """Build a match dict from a timer element and its containing element.

        Args:
            parent: Element searched for ``team`` spans and a ``/dota2/`` link.
            timer: Element whose ``data-timestamp`` attribute holds the start
                time in epoch seconds.

        Returns:
            The match dict, including a generated ``'id'``, when both teams
            were found; otherwise ``None``. Any exception (for example an
            unreadable timestamp) is swallowed and also yields ``None``.
        """
        try:
            epoch = timer.get('data-timestamp')
            info = {'datetime': datetime.fromtimestamp(int(epoch), tz=pytz.UTC)}

            # Teams: the first two spans whose class mentions "team".
            team_spans = parent.find_all('span', {'class': re.compile(r'team')})
            if len(team_spans) >= 2:
                home, away = team_spans[0], team_spans[1]
                info['team1'] = self._clean_team_name(home.get_text())
                info['team2'] = self._clean_team_name(away.get_text())

            # Tournament: text of the first /dota2/ link.
            anchor = parent.find('a', href=re.compile(r'/dota2/[^#]+'))
            if anchor:
                info['tournament'] = anchor.get_text().strip()

            if 'team1' in info and 'team2' in info:
                info['id'] = self._generate_match_id(info)
                return info

        except Exception:
            pass

        return None

    def _clean_team_name(self, name):
        """Normalise a raw team name scraped from the page.

        Collapses every run of whitespace into a single space, trims both
        ends, then removes a trailing parenthesised suffix such as
        ``" (EU)"``.
        """
        collapsed = re.sub(r'\s+', ' ', name)
        trimmed = collapsed.strip()
        # Drop parenthetical info at the end of the name
        return re.sub(r'\s*\(.*?\)\s*$', '', trimmed)

    def _generate_match_id(self, match_data):
        """Generate a unique ID for a match"""
        # Use teams and datetime if available, otherwise use what we have
        id_parts = []

        if 'team1' in match_data:
            id_parts.append(match_data['team1'])
        if 'team2' in match_data:
            id_parts.append(match_data['team2'])
        if 'datetime' in match_data:
            id_parts.append(str(match_data['datetime'].date()))
        if 'tournament' in match_data:
            id_parts.append(match_data['tournament'])

        unique_string = '_'.join(id_parts)
        return hashlib.md5(unique_string.encode()).hexdigest()[:16]

    def get_existing_events(self, days_ahead=30):
        """Return the upcoming Dota 2 events already present in the calendar.

        Args:
            days_ahead: Size of the look-ahead window, in days from now.

        Returns:
            A dict mapping the match ID embedded in each event description
            (its ``ID: <hex>`` line) to the event resource. Only events whose
            summary contains ``"Dota 2"`` are considered, and events without
            an ID line are ignored. An empty dict is returned if a Calendar
            API call fails.
        """
        try:
            # Naive UTC "now" without the deprecated datetime.utcnow(); the
            # "Z" suffix appended below marks the timestamps as UTC.
            now = datetime.now(timezone.utc).replace(tzinfo=None)
            time_min = now.isoformat() + 'Z'
            time_max = (now + timedelta(days=days_ahead)).isoformat() + 'Z'

            print(f"Checking existing events in calendar...")

            # events.list is paginated: keep following nextPageToken, or
            # events past the first page would be missed and re-inserted.
            events = []
            page_token = None
            while True:
                params = {
                    'calendarId': self.calendar_id,
                    'timeMin': time_min,
                    'timeMax': time_max,
                    'maxResults': 200,
                    'singleEvents': True,
                    'orderBy': 'startTime',
                }
                if page_token:
                    params['pageToken'] = page_token

                page = self.service.events().list(**params).execute()
                events.extend(page.get('items', []))

                page_token = page.get('nextPageToken')
                if not page_token:
                    break

            # Filter for Dota 2 events and index them by embedded match ID.
            dota_events = {}
            for event in events:
                if 'Dota 2' not in event.get('summary', ''):
                    continue
                id_match = re.search(r'ID:\s*([a-f0-9]+)', event.get('description', ''))
                if id_match:
                    dota_events[id_match.group(1)] = event

            print(f"✓ Found {len(dota_events)} existing Dota 2 events")
            return dota_events

        except Exception as e:
            print(f"✗ Error fetching calendar events: {e}")
            return {}

    def create_calendar_event(self, match_data):
        """Translate a parsed match into a Google Calendar event body.

        Args:
            match_data: Match dict; ``'id'`` is required, while ``'team1'``,
                ``'team2'``, ``'tournament'``, ``'format'`` and ``'datetime'``
                are optional.

        Returns:
            The event dict. Missing teams are shown as ``TBD`` and a missing
            start time falls back to the current time. The event lasts
            4/3/1 hours for Bo5/Bo3/Bo1 and 2 hours otherwise, and carries a
            single popup reminder 30 minutes before the start.
        """
        team1 = match_data.get('team1', 'TBD')
        team2 = match_data.get('team2', 'TBD')
        tournament = match_data.get('tournament', '')
        has_format = 'format' in match_data

        # Title
        summary = (
            f"Dota 2 - {tournament}: {team1} vs {team2}"
            if tournament
            else f"Dota 2: {team1} vs {team2}"
        )

        # Description; the "ID:" line is what later runs use to recognise
        # the event as already synced.
        lines = [f"Tournament: {tournament}"] if tournament else []
        lines.append(f"Match: {team1} vs {team2}")
        if has_format:
            lines.append(f"Format: {match_data['format']}")
        lines += [f"ID: {match_data['id']}", "\nSource: Liquipedia"]
        description = '\n'.join(lines)

        # Estimated length in hours, keyed by series format (first hit wins).
        duration = 2
        if has_format:
            for label, hours in (('Bo5', 4), ('Bo3', 3), ('Bo1', 1)):
                if label in match_data['format']:
                    duration = hours
                    break

        start_time = match_data.get('datetime', datetime.now(pytz.UTC))
        end_time = start_time + timedelta(hours=duration)

        def as_utc(moment):
            return {'dateTime': moment.isoformat(), 'timeZone': 'UTC'}

        popup_reminder = {'method': 'popup', 'minutes': 30}
        return {
            'summary': summary,
            'description': description,
            'start': as_utc(start_time),
            'end': as_utc(end_time),
            'reminders': {'useDefault': False, 'overrides': [popup_reminder]},
            'colorId': '9',  # Blue color for Dota 2 events
        }

    def sync_matches_to_calendar(self, dry_run=False):
        """Main sync function"""
        print("\n" + "="*50)
        print("Starting Dota 2 Calendar Sync")
        print("="*50 + "\n")

        # Fetch matches from Liquipedia
        matches = self.fetch_liquipedia_matches()

        if not matches:
            print("No matches found to sync")
            return

        # Filter for future matches only
        now = datetime.now(pytz.UTC)
        future_matches = [m for m in matches if m.get('datetime', now) >= now]

        print(f"Filtered to {len(future_matches)} future matches")

        if not future_matches:
            print("No future matches to sync")
            return

        # Get existing events
        existing_events = self.get_existing_events()

        # Process each match
        added_count = 0
        skipped_count = 0
        error_count = 0

        print("\nProcessing matches...")
        print("-" * 30)

        for match in future_matches:
            match_id = match.get('id')
            team1 = match.get('team1', 'TBD')
            team2 = match.get('team2', 'TBD')
            match_time = match.get('datetime', now)

            if not match_id:
                continue

            if match_id in existing_events:
                print(f"⊘ Skipping (exists): {team1} vs {team2}")
                skipped_count += 1
            else:
                if dry_run:
                    print(f"◯ Would add: {team1} vs {team2} at {match_time.strftime('%Y-%m-%d %H:%M UTC')}")
                    added_count += 1
                else:
                    try:
                        event = self.create_calendar_event(match)
                        self.service.events().insert(
                            calendarId=self.calendar_id,
                            body=event
                        ).execute()
                        print(f"✓ Added: {team1} vs {team2} at {match_time.strftime('%Y-%m-%d %H:%M UTC')}")
                        added_count += 1
                    except Exception as e:
                        print(f"✗ Error adding {team1} vs {team2}: {e}")
                        error_count += 1

        # Summary
        print("\n" + "="*50)
        print("Sync Summary")
        print("="*50)
        print(f"✓ Added: {added_count} matches")
        print(f"⊘ Skipped: {skipped_count} matches (already exist)")
        if error_count > 0:
            print(f"✗ Errors: {error_count} matches")

        if dry_run:
            print("\n⚠ DRY RUN - No actual changes were made")

        print("\n✓ Sync complete!")

def main():
    """Command-line entry point: parse options, print the setup notice, run one sync.

    Exits with status 0 when interrupted with Ctrl-C and with status 1 on any
    other exception raised during the sync.
    """
    cli = argparse.ArgumentParser(
        description='Sync Dota 2 Tier 1 matches from Liquipedia to Google Calendar'
    )
    cli.add_argument(
        '--calendar-id',
        default='primary',
        help='Google Calendar ID (default: primary). Use email address for specific calendar.',
    )
    cli.add_argument(
        '--dry-run',
        action='store_true',
        help='Perform a dry run without actually creating events',
    )
    cli.add_argument(
        '--credentials',
        default='credentials.json',
        help='Path to Google service account credentials JSON file',
    )
    options = cli.parse_args()

    # Setup reminder shown on every run, framed by two rows of "!".
    banner = "!"*60
    notice = (
        "\n" + banner,
        "IMPORTANT: Before using this script:",
        "1. Share your Google Calendar with the service account",
        "   Service Account Email: calendar-bot@tunpok.iam.gserviceaccount.com",
        "2. Grant 'Make changes to events' permission",
        "3. Use your calendar email as --calendar-id parameter",
        banner + "\n",
    )
    for line in notice:
        print(line)

    # Initialize and run sync
    try:
        syncer = Dota2CalendarSync(
            credentials_file=options.credentials,
            calendar_id=options.calendar_id,
        )
        syncer.sync_matches_to_calendar(dry_run=options.dry_run)
    except KeyboardInterrupt:
        print("\n\nSync cancelled by user")
        sys.exit(0)
    except Exception as err:
        print(f"\n✗ Fatal error: {err}")
        sys.exit(1)

# Run the sync only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()