import pandas as pd
import numpy as np
import glob
import os

# Root directory holding one subdirectory per model, each with a stats/ folder
# of concatenated QC log files.
base_path = "/g/data/xv83/users/bxn599/qc_20i_concatenated/"
search_pattern = os.path.join(base_path, "*/stats/*.log")

# Collect every per-model stats log under the base path.
log_files = glob.glob(search_pattern)

if not log_files:
    print(f"No log files found in {search_pattern}")
    # Finding nothing is a failure for a batch job: exit non-zero so schedulers
    # notice.  SystemExit is always available, unlike the site-module exit().
    raise SystemExit(1)

def process_log(file_path, z_threshold=3.0):
    """Print per-field monthly-anomaly statistics for one stats log file.

    The log is expected to be a CSV containing a ``validtime`` column plus
    some subset of the known numeric fields.  Each field is grouped by
    calendar month and any value whose absolute z-score within its month
    exceeds ``z_threshold`` is reported as an anomaly.

    Parameters
    ----------
    file_path : str
        Path to the CSV log file.  The model name is taken from the
        directory two levels up (layout assumed: ``<model>/stats/<file>``).
    z_threshold : float, optional
        Absolute z-score above which a value is flagged (default 3.0).
    """
    print(f"\n{'='*80}")
    # <model>/stats/<file>.log -- extract the model directory with os.path
    # so this works regardless of the platform path separator (the previous
    # split('/') was POSIX-only).
    model_name = os.path.basename(os.path.dirname(os.path.dirname(file_path)))
    print(f"MODEL: {model_name} | FILE: {os.path.basename(file_path)}")
    print(f"{'='*80}")

    try:
        # Read the CSV
        df = pd.read_csv(file_path)

        # Clean column names (sometimes CSVs have leading/trailing whitespace)
        df.columns = df.columns.str.strip()

        # Ensure validtime is a datetime object, then derive the month so the
        # statistics are seasonal rather than whole-record.
        df['validtime'] = pd.to_datetime(df['validtime'])
        df['month'] = df['validtime'].dt.month

        fields = ['fieldmean', 'fieldmin', 'fieldmax', 'nummasked', 'numnan']

        for col in fields:
            if col not in df.columns:
                continue

            # Convert to numeric, forcing errors to NaN (handles weird strings in logs)
            df[col] = pd.to_numeric(df[col], errors='coerce')

            # Build the groupby once per column; transform keeps the result
            # index-aligned with the original dataframe.
            monthly = df.groupby('month')[col]
            m_mean = monthly.transform('mean')
            m_std = monthly.transform('std')

            # Z-score per row.  A zero std (all values identical) is mapped to
            # NaN before dividing, then fillna(0) makes those rows z == 0
            # instead of inf/NaN, so they can never be flagged.
            z_scores = np.abs((df[col] - m_mean) / m_std.replace(0, np.nan)).fillna(0)

            # Rows whose monthly z-score exceeds the threshold.
            outlier_indices = df.index[z_scores > z_threshold].tolist()

            # Summary Print
            print(f"Field: {col:<12} | Mean: {df[col].mean():>10.4f} | Std: {df[col].std():>10.4f}")

            if outlier_indices:
                print(f"  [!] Found {len(outlier_indices)} anomalies:")
                for idx in outlier_indices:
                    row = df.loc[idx]
                    print(f"      Time: {row['validtime']} | Value: {row[col]:>10.4f} (Z: {z_scores[idx]:.2f})")

    except Exception as e:
        # Best-effort batch processing: report the failure and let the caller
        # continue with the remaining files.
        print(f"Error processing {file_path}: {e}")

# Run the anomaly report over every discovered log file.
for log_path in log_files:
    process_log(log_path)
