from typing import Optional

import numpy as np
import pandas as pd

class FeatureEngineer:
    """Builds per-team form features from a table of football matches.

    Input frames use the columns ``id_jogo``, ``data_jogo``, ``time_casa``,
    ``time_fora``, ``placar_casa`` and ``placar_fora``.

    The same statistic definitions (see ``_feature_specs``) are used both for
    historical rows (``prepare_features``) and for a future fixture
    (``get_features_for_match``), so the features computed for a new match are
    the ones ``prepare_features`` would produce if that match were appended
    to the history.
    """

    # Key order of the per-team dict produced for a future fixture. Kept
    # stable for existing callers; 'std_goals_scored_5' is a legacy
    # placeholder that is always 0 and has no training counterpart.
    _LEGACY_KEY_ORDER = (
        'ema_goals_scored_5',
        'ema_goals_conceded_5',
        'ema_points_5',
        'std_goals_scored_5',
        'global_goals_scored',
        'global_goals_conceded',
        'ema_goals_scored_10',
        'ema_goals_conceded_10',
        'ema_points_10',
        'std_goals_scored_10',
    )

    def __init__(self):
        pass

    def prepare_features(self, matches_df: pd.DataFrame) -> pd.DataFrame:
        """Attach pre-match form features to every match in ``matches_df``.

        Args:
            matches_df: Matches with columns id_jogo, time_casa, time_fora,
                placar_casa, placar_fora, data_jogo.

        Returns:
            The matches sorted by ``data_jogo`` with ``home_*`` / ``away_*``
            statistic columns plus ``diff_goals_scored_recent`` and
            ``diff_goals_conceded_recent``. An empty, column-less DataFrame
            is returned when ``matches_df`` is empty.
        """
        if matches_df.empty:
            return pd.DataFrame()

        # Stable sort so fixtures sharing a date keep their input order.
        df = matches_df.sort_values('data_jogo', kind='mergesort').copy()

        team_stats = self._calculate_team_stats(df)
        stat_cols = [c for c in team_stats.columns if c not in ('team', 'id_jogo')]

        # Join the team-centric stats twice: once for the home side and once
        # for the away side. Renaming before the merge avoids suffix handling
        # and a temporary 'team' column.
        for side, team_col in (('home', 'time_casa'), ('away', 'time_fora')):
            renames = {c: f'{side}_{c}' for c in stat_cols}
            renames['team'] = team_col
            side_stats = team_stats.rename(columns=renames)
            df = df.merge(side_stats, on=[team_col, 'id_jogo'], how='left')

        # Interaction features: recent attacking / defensive form gap.
        df['diff_goals_scored_recent'] = (
            df['home_ema_goals_scored_5'] - df['away_ema_goals_scored_5']
        )
        df['diff_goals_conceded_recent'] = (
            df['home_ema_goals_conceded_5'] - df['away_ema_goals_conceded_5']
        )
        return df

    @staticmethod
    def _feature_specs():
        """Return ``(feature_name, source_column, stat)`` triples in column order.

        Each ``stat`` maps a chronological Series of one team's values to a
        Series holding the statistic up to and including every position.
        """
        specs = []
        for span in (5, 10):
            for column in ('goals_scored', 'goals_conceded', 'points'):
                specs.append((
                    f'ema_{column}_{span}',
                    column,
                    # Bind span as a default to avoid late-binding closures.
                    lambda s, span=span: s.ewm(span=span, min_periods=1).mean(),
                ))
        # Volatility of goals scored over the last 10 games.
        specs.append((
            'std_goals_scored_10',
            'goals_scored',
            lambda s: s.rolling(window=10, min_periods=2).std(),
        ))
        # Long-run averages over the team's whole history.
        specs.append(('global_goals_scored', 'goals_scored',
                      lambda s: s.expanding().mean()))
        specs.append(('global_goals_conceded', 'goals_conceded',
                      lambda s: s.expanding().mean()))
        return specs

    @staticmethod
    def _team_centric_view(df: pd.DataFrame, id_cols=('id_jogo', 'data_jogo')) -> pd.DataFrame:
        """Reshape matches to one row per team per match, sorted by team then date.

        The result has ``id_cols`` plus ``team``, ``goals_scored``,
        ``goals_conceded`` (both cast to float) and ``points`` (3 win, 1 draw,
        0 loss, NaN when either score is missing). ``data_jogo`` must be among
        ``id_cols`` because it is used for sorting.
        """
        id_cols = list(id_cols)
        home = df[id_cols + ['time_casa', 'placar_casa', 'placar_fora']].rename(
            columns={'time_casa': 'team', 'placar_casa': 'goals_scored',
                     'placar_fora': 'goals_conceded'}
        )
        away = df[id_cols + ['time_fora', 'placar_fora', 'placar_casa']].rename(
            columns={'time_fora': 'team', 'placar_fora': 'goals_scored',
                     'placar_casa': 'goals_conceded'}
        )
        # ignore_index gives every team-row a unique label, so later
        # index-aligned column assignments cannot be ambiguous.
        rows = pd.concat([home, away], ignore_index=True)

        scored = rows['goals_scored'].astype(float)
        conceded = rows['goals_conceded'].astype(float)
        rows['goals_scored'] = scored
        rows['goals_conceded'] = conceded

        points = np.select(
            [(scored > conceded).to_numpy(), (scored == conceded).to_numpy()],
            [3.0, 1.0],
            default=0.0,
        )
        # A match without a score is unknown, not a loss.
        points[(scored.isna() | conceded.isna()).to_numpy()] = np.nan
        rows['points'] = points

        return rows.sort_values(['team', 'data_jogo'])

    def _calculate_team_stats(self, df: pd.DataFrame) -> pd.DataFrame:
        """Compute pre-match statistics for every team at every match.

        Every statistic is evaluated on ``shift(1)`` of the team's series, so
        the row for a match only uses results of that team's earlier matches.
        Rows without enough history are filled with 0.

        Returns:
            DataFrame with ``id_jogo``, ``team`` and one column per entry of
            ``_feature_specs``; one row per team per match.
        """
        team_df = self._team_centric_view(df)
        grouped = team_df.groupby('team')

        features = team_df[['id_jogo', 'team']].copy()
        stat_cols = []
        for name, column, stat in self._feature_specs():
            features[name] = grouped[column].transform(
                lambda s, stat=stat: stat(s.shift(1))
            )
            stat_cols.append(name)

        # Only the statistic columns are filled; identifiers are left as-is.
        return features.fillna({name: 0 for name in stat_cols})

    def get_features_for_match(self, home_team: str, away_team: str,
                               historical_matches: pd.DataFrame) -> Optional[pd.DataFrame]:
        """Build the single feature row for a future ``home_team`` v ``away_team`` match.

        Args:
            home_team: Name of the home side.
            away_team: Name of the away side.
            historical_matches: Past matches in the ``prepare_features`` schema.

        Returns:
            A one-row DataFrame of features, or ``None`` when the history is
            empty or ``home_team`` does not appear in it (as home or away).
            An ``away_team`` without history gets all-zero statistics.
        """
        if historical_matches is None or historical_matches.empty:
            return None

        # Away fixtures count as history too; the team only has to appear
        # somewhere in the table.
        home_known = (
            (historical_matches['time_casa'] == home_team)
            | (historical_matches['time_fora'] == home_team)
        )
        if not home_known.any():
            return None

        return self._manual_feature_construction(home_team, away_team, historical_matches)

    def _latest_team_stats(self, history: pd.DataFrame) -> dict:
        """Return the statistics of one team after its most recent match.

        ``history`` is that team's chronological slice of
        ``_team_centric_view``. Missing values (no or too little history)
        become 0.0, matching the fill used for historical rows.
        """
        latest = {}
        for name, column, stat in self._feature_specs():
            value = stat(history[column]).iloc[-1] if len(history) else np.nan
            latest[name] = 0.0 if pd.isna(value) else float(value)
        latest['std_goals_scored_5'] = 0.0  # legacy placeholder column
        return {key: latest[key] for key in self._LEGACY_KEY_ORDER}

    def _manual_feature_construction(self, home_team, away_team, df):
        """Assemble the feature row for a future fixture from raw history ``df``.

        Unlike ``prepare_features`` no shift is applied: the statistics
        include each team's latest played match, which is exactly what a
        match appended after the history would see.

        Returns:
            One-row DataFrame with ``home_*`` and ``away_*`` statistics
            followed by the two ``diff_*_recent`` interaction columns.
        """
        team_rows = self._team_centric_view(df, id_cols=('data_jogo',))

        features = {}
        for side, team in (('home', home_team), ('away', away_team)):
            history = team_rows[team_rows['team'] == team]
            for key, value in self._latest_team_stats(history).items():
                features[f'{side}_{key}'] = value

        # Interactions
        features['diff_goals_scored_recent'] = (
            features['home_ema_goals_scored_5'] - features['away_ema_goals_scored_5']
        )
        features['diff_goals_conceded_recent'] = (
            features['home_ema_goals_conceded_5'] - features['away_ema_goals_conceded_5']
        )

        return pd.DataFrame([features])
