import pandas as pd
import numpy as np

# Load the QC statistics log (CSV-formatted). If the file is not reachable
# (e.g. running this script outside the HPC filesystem), fall back to an
# in-memory demo table that carries the same column layout.
try:
    df = pd.read_csv('/g/data/xv83/users/bxn599/qc_20i_concatenated/ccam_access-cm2_historical_aus-20i_22km/stats/stats.mon_tas.log')
except FileNotFoundError:
    from io import StringIO

    # Header-only demo CSV (rows truncated for brevity) so the rest of the
    # script still executes without access to the real data file.
    demo_csv = """filename,realization,validtime,varname,level,fieldmean,fieldmin,fieldmax,nummasked,numnan"""
    df = pd.read_csv(StringIO(demo_csv))

# Numeric columns to summarise and to screen for anomalies below.
fields = ['fieldmean', 'fieldmin', 'fieldmax', 'nummasked', 'numnan']

print("### Data Summary Statistics ###")
# One row per field, with its mean and standard deviation as columns.
summary = df[fields].agg(['mean', 'std']).T
print(summary)
print("-" * 40)

def detect_outliers(dataframe, columns, threshold=3):
    """Report rows whose values lie more than ``threshold`` standard
    deviations from the column mean (two-sided z-score test).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a 'validtime' column plus the columns being screened.
    columns : iterable of str
        Names of the numeric columns to check.
    threshold : float, default 3
        Z-score magnitude beyond which a value is reported.

    Returns
    -------
    bool
        True if any anomaly was found in any column (backward-compatible
        addition: previous version returned None).
    """
    outliers_found = False

    for col in columns:
        col_mean = dataframe[col].mean()
        col_std = dataframe[col].std()

        # Skip constant columns (std == 0) to avoid division by zero.
        # A NaN std (empty or single-row frame) is safe: NaN comparisons
        # below are False, so the mask is all-False and nothing is flagged.
        if col_std == 0:
            continue

        # |z| > threshold catches both high and low extremes.
        outlier_mask = np.abs((dataframe[col] - col_mean) / col_std) > threshold

        if outlier_mask.any():
            outliers_found = True
            print(f"\n[!] Anomalies detected in: {col}")
            # Display the rows where the anomaly exists
            print(dataframe.loc[outlier_mask, ['validtime', col]])

    if not outliers_found:
        # Fix: message previously hard-coded "3" regardless of `threshold`.
        print(f"\nNo values exceeded the {threshold}-standard-deviation threshold.")

    return outliers_found

# Run the z-score anomaly screen over every numeric field loaded above.
detect_outliers(df, fields)
