Climate Data Analysis Workflows

This guide provides comprehensive workflows for analyzing climate data using Monet Stats. These workflows cover common scenarios in climate model evaluation, atmospheric data analysis, and environmental monitoring.

Overview

Climate data analysis requires specialized statistical approaches due to:

Multi-scale temporal patterns: Daily, seasonal, and interannual variability
Spatial heterogeneity: Regional and local climate variations
Complex dependencies: Autocorrelation, teleconnections, and non-stationarity
Extreme value analysis: Rare events and long-term trends
Multi-model ensembles: Uncertainty quantification and model comparison

Workflow 1: Temperature Trend Analysis

Objective

Analyze long-term temperature trends and model performance across different time scales.

import numpy as np
import pandas as pd
import monet_stats as ms
import matplotlib.pyplot as plt

# Generate synthetic temperature data (1980-2020)
years = np.arange(1980, 2021)
n_years = len(years)

# Observed temperatures with trend and natural variability
trend = 0.02 * (years - 1980)  # 0.02°C per year warming
observed = 15.0 + trend + np.random.normal(0, 0.5, n_years)

# Model 1: Good performance
model1 = observed + np.random.normal(0, 0.3, n_years)

# Model 2: Biased with poor trend capture
model2 = observed + 1.0 + np.random.normal(0, 0.8, n_years)

# Create DataFrame
temp_data = pd.DataFrame({
    'year': years,
    'observed': observed,
    'model1': model1,
    'model2': model2
})

Step 1: Annual Analysis

# Calculate annual metrics
annual_metrics = temp_data.apply(
    lambda x: pd.Series({
        'RMSE': ms.RMSE(x['observed'], x['model1']),
        'R2': ms.R2(x['observed'], x['model1']),
        'MB': ms.MB(x['observed'], x['model1']),
        'NSE': ms.NSE(x['observed'], x['model1'])
    }),
    axis=1
)

print("Annual Performance Metrics (Model 1):")
print(annual_metrics.describe())

Step 2: Decadal Analysis

# Group by decade
temp_data['decade'] = (temp_data['year'] // 10) * 10
decadal_metrics = temp_data.groupby('decade').apply(
    lambda x: pd.Series({
        'RMSE': ms.RMSE(x['observed'], x['model1']),
        'R2': ms.R2(x['observed'], x['model1']),
        'Trend_RMSE': ms.RMSE(
            np.polyfit(x['year'] - x['year'].iloc[0], x['observed'], 1)[0],
            np.polyfit(x['year'] - x['year'].iloc[0], x['model1'], 1)[0]
        )
    })
).round(4)

print("\nDecadal Performance Trends:")
print(decadal_metrics)

Step 3: Seasonal Analysis

# Add seasonal component
np.random.seed(42)
temp_data['month'] = np.random.choice(range(1, 13), size=len(temp_data))
temp_data['season'] = temp_data['month'].map({
    12: 'Winter', 1: 'Winter', 2: 'Winter',
    3: 'Spring', 4: 'Spring', 5: 'Spring',
    6: 'Summer', 7: 'Summer', 8: 'Summer',
    9: 'Fall', 10: 'Fall', 11: 'Fall'
})

# Seasonal performance analysis
seasonal_performance = temp_data.groupby('season').apply(
    lambda x: pd.Series({
        'RMSE': ms.RMSE(x['observed'], x['model1']),
        'R2': ms.R2(x['observed'], x['model1']),
        'MB': ms.MB(x['observed'], x['model1']),
        'MAE': ms.MAE(x['observed'], x['model1']),
        'NMB': ms.NMB(x['observed'], x['model1'])
    })
)

print("\nSeasonal Performance Analysis:")
print(seasonal_performance)

Workflow 2: Precipitation Extremes Analysis

Objective

Evaluate model performance for extreme precipitation events and return periods.

# Generate precipitation data with heavy-tailed distribution
np.random.seed(123)
n_years = 50  # 50 years of daily data

# Daily precipitation (mm/day)
precip_daily = np.random.gamma(0.5, 5, n_years * 365)
precip_daily = np.maximum(precip_daily, 0)  # Ensure non-negative

# Add extreme events (rare heavy precipitation)
extreme_events = np.random.choice(len(precip_daily),
                                 size=int(0.001 * len(precip_daily)),
                                 replace=False)
precip_daily[extreme_events] += np.random.exponential(20, len(extreme_events))

# Reshape to yearly
yearly_max = precip_daily.reshape(n_years, 365).max(axis=1)

# Model simulations with different performance
model1_good = yearly_max + np.random.normal(0, 2, n_years)
model2_conservative = yearly_max * 0.8 + np.random.normal(0, 1, n_years)
model3_overpredict = yearly_max * 1.5 + np.random.normal(0, 3, n_years)

# Create DataFrame
precip_data = pd.DataFrame({
    'year': np.arange(1970, 1970 + n_years),
    'observed': yearly_max,
    'model1': model1_good,
    'model2': model2_conservative,
    'model3': model3_overpredict
})

Step 1: Extreme Value Analysis

# Define extreme thresholds
thresholds = [95, 99]  # 95th and 99th percentiles

for threshold in thresholds:
    print(f"\nAnalysis for {threshold}th percentile threshold:")

    # Binary extreme events
    obs_extreme = (precip_data['observed'] >= np.percentile(precip_data['observed'], threshold)).astype(int)

    for model_name in ['model1', 'model2', 'model3']:
        mod_extreme = (precip_data[model_name] >= np.percentile(precip_data['observed'], threshold)).astype(int)

        # Contingency table metrics
        pod = ms.POD(obs_extreme, mod_extreme, threshold=0.5)
        far = ms.FAR(obs_extreme, mod_extreme, threshold=0.5)
        csi = ms.CSI(obs_extreme, mod_extreme, threshold=0.5)
        hss = ms.HSS(obs_extreme, mod_extreme, threshold=0.5)

        print(f"  {model_name}: POD={pod:.3f}, FAR={far:.3f}, CSI={csi:.3f}, HSS={hss:.3f}")

Step 2: Return Period Analysis

def calculate_return_period(values, return_years):
    """Calculate return period values from observed data"""
    sorted_values = np.sort(values)[::-1]
    n = len(values)
    exceedance_probs = np.arange(1, n + 1) / (n + 1)
    return_periods = 1 / exceedance_probs

    # Interpolate to desired return periods
    from scipy.interpolate import interp1d
    interp_func = interp1d(return_periods, sorted_values, bounds_error=False, fill_value='extrapolate')

    return interp_func(return_years)

# Calculate return periods for different models
return_years = [2, 5, 10, 25, 50, 100]
return_period_comparison = pd.DataFrame({
    'Return_Year': return_years
})

for col in ['observed', 'model1', 'model2', 'model3']:
    return_period_comparison[col] = calculate_return_period(precip_data[col], return_years)

print("\nReturn Period Analysis (mm):")
print(return_period_comparison.set_index('Return_Year'))

Step 3: Bias Analysis for Different Intensity Ranges

# Define precipitation intensity categories
def categorize_precipitation(precip):
    """Categorize precipitation by intensity"""
    categories = []
    for p in precip:
        if p == 0:
            categories.append('No rain')
        elif p < 2:
            categories.append('Light')
        elif p < 10:
            categories.append('Moderate')
        elif p < 25:
            categories.append('Heavy')
        else:
            categories.append('Extreme')
    return categories

# Apply categorization
for col in ['observed', 'model1', 'model2', 'model3']:
    precip_data[f'{col}_category'] = categorize_precipitation(precip_data[col])

# Bias analysis by intensity
intensity_bias = []
for category in ['No rain', 'Light', 'Moderate', 'Heavy', 'Extreme']:
    subset = precip_data[precip_data['observed_category'] == category]

    if len(subset) > 0:
        bias = ms.MB(subset['observed'], subset['model1'])
        nmb = ms.NMB(subset['observed'], subset['model1'])

        intensity_bias.append({
            'Intensity': category,
            'Count': len(subset),
            'Mean_Bias': bias,
            'Normalized_Bias': nmb
        })

intensity_bias_df = pd.DataFrame(intensity_bias)
print("\nBias Analysis by Intensity Category:")
print(intensity_bias_df)

Workflow 3: Multi-Model Ensemble Analysis

Objective

Evaluate and combine multiple climate models to improve prediction accuracy.

# Generate synthetic multi-model data
n_models = 10
n_years = 30
years = np.arange(1990, 1990 + n_years)

# Generate ensemble models with different performance characteristics
ensemble_data = pd.DataFrame({'year': years, 'observed': 15 + 0.01 * (years - 1990)})

for i in range(n_models):
    # Each model has different bias and error characteristics
    bias = np.random.normal(0, 0.5)  # Model bias
    error_std = np.random.uniform(0.2, 1.0)  # Model error

    ensemble_data[f'model_{i+1}'] = (
        ensemble_data['observed'] +
        bias +
        np.random.normal(0, error_std, n_years)
    )

Step 1: Model Evaluation

# Evaluate individual models
model_performance = []
for i in range(1, n_models + 1):
    model_col = f'model_{i}'

    performance = {
        'Model': model_col,
        'RMSE': ms.RMSE(ensemble_data['observed'], ensemble_data[model_col]),
        'R2': ms.R2(ensemble_data['observed'], ensemble_data[model_col]),
        'MB': ms.MB(ensemble_data['observed'], ensemble_data[model_col]),
        'NSE': ms.NSE(ensemble_data['observed'], ensemble_data[model_col]),
        'KGE': ms.KGE(ensemble_data['observed'], ensemble_data[model_col])
    }
    model_performance.append(performance)

performance_df = pd.DataFrame(model_performance).sort_values('RMSE')
print("Individual Model Performance:")
print(performance_df)

Step 2: Ensemble Construction

# Simple mean ensemble
ensemble_mean = ensemble_data.filter(like='model_').mean(axis=1)

# Weighted ensemble based on performance
weights = 1 / performance_df['RMSE'].values
weights = weights / weights.sum()

weighted_ensemble = np.zeros(len(ensemble_data))
for i, (model_col, weight) in enumerate(zip(performance_df['Model'], weights)):
    weighted_ensemble += ensemble_data[model_col] * weight

# Add ensembles to dataframe
ensemble_data['ensemble_mean'] = ensemble_mean
ensemble_data['ensemble_weighted'] = weighted_ensemble

# Evaluate ensemble performance
ensemble_performance = pd.DataFrame({
    'System': ['Best_Model', 'Mean_Ensemble', 'Weighted_Ensemble'],
    'RMSE': [
        performance_df.iloc[0]['RMSE'],
        ms.RMSE(ensemble_data['observed'], ensemble_data['ensemble_mean']),
        ms.RMSE(ensemble_data['observed'], ensemble_data['ensemble_weighted'])
    ],
    'R2': [
        performance_df.iloc[0]['R2'],
        ms.R2(ensemble_data['observed'], ensemble_data['ensemble_mean']),
        ms.R2(ensemble_data['observed'], ensemble_data['ensemble_weighted'])
    ]
})

print("\nEnsemble Performance vs Best Individual Model:")
print(ensemble_performance)

Step 3: Spread-Skill Relationship

def calculate_spread_skill(model_data, observed):
    """Calculate spread and skill for ensemble"""
    # Ensemble spread (standard deviation)
    spread = model_data.std(axis=1)

    # Ensemble skill (RMSE of ensemble mean)
    ensemble_mean = model_data.mean(axis=1)
    skill = ms.RMSE(observed, ensemble_mean)

    return spread, skill

# Calculate spread-skill relationship
spread, skill = calculate_spread_skill(
    ensemble_data.filter(like='model_'),
    ensemble_data['observed']
)

# Analyze correlation
spread_skill_corr = ms.pearsonr(spread, skill)[0]
print(f"\nSpread-Skill Correlation: {spread_skill_corr:.3f}")

# Plot spread-skill relationship
plt.figure(figsize=(8, 6))
plt.scatter(spread, skill, alpha=0.6)
plt.xlabel('Ensemble Spread')
plt.ylabel('Ensemble Skill (RMSE)')
plt.title(f'Spread-Skill Relationship (r = {spread_skill_corr:.3f})')
plt.grid(True, alpha=0.3)
plt.show()

Workflow 4: Spatial Climate Downscaling

Objective

Evaluate high-resolution downscaled climate model data against observations.

import numpy as np

# Generate spatial data (e.g., temperature field)
grid_size = 20
x, y = np.meshgrid(range(grid_size), range(grid_size))

# Create spatial pattern with some randomness
observed_spatial = (
    20 + 0.1 * x + 0.05 * y +
    2 * np.sin(2 * np.pi * x / grid_size) * np.cos(2 * np.pi * y / grid_size) +
    np.random.normal(0, 0.5, (grid_size, grid_size))
)

# Model 1: Good spatial representation
model1_spatial = observed_spatial + np.random.normal(0, 0.3, (grid_size, grid_size))

# Model 2: Poor spatial representation (over-smoothed)
from scipy.ndimage import gaussian_filter
model2_spatial = gaussian_filter(observed_spatial + np.random.normal(0, 0.5, (grid_size, grid_size)), sigma=2.0)

Step 1: Point-wise Verification

# Flatten arrays for point-wise analysis
observed_flat = observed_spatial.flatten()
model1_flat = model1_spatial.flatten()
model2_flat = model2_spatial.flatten()

# Point-wise metrics
point_metrics = pd.DataFrame({
    'Metric': ['RMSE', 'MAE', 'R2', 'MB', 'NMB'],
    'Model1': [
        ms.RMSE(observed_flat, model1_flat),
        ms.MAE(observed_flat, model1_flat),
        ms.R2(observed_flat, model1_flat),
        ms.MB(observed_flat, model1_flat),
        ms.NMB(observed_flat, model1_flat)
    ],
    'Model2': [
        ms.RMSE(observed_flat, model2_flat),
        ms.MAE(observed_flat, model2_flat),
        ms.R2(observed_flat, model2_flat),
        ms.MB(observed_flat, model2_flat),
        ms.NMB(observed_flat, model2_flat)
    ]
})

print("Point-wise Spatial Verification:")
print(point_metrics.set_index('Metric'))

Step 2: Spatial Structure Analysis

# Calculate spatial gradients
def calculate_spatial_gradients(field):
    """Calculate gradients in x and y directions"""
    grad_x = np.gradient(field, axis=1)
    grad_y = np.gradient(field, axis=0)
    grad_magnitude = np.sqrt(grad_x**2 + grad_y**2)
    return grad_magnitude

# Spatial structure metrics
obs_grad = calculate_spatial_gradients(observed_spatial)
model1_grad = calculate_spatial_gradients(model1_spatial)
model2_grad = calculate_spatial_gradients(model2_spatial)

# Compare gradient statistics
gradient_stats = pd.DataFrame({
    'Statistic': ['Mean', 'Std', 'Max'],
    'Observed': [obs_grad.mean(), obs_grad.std(), obs_grad.max()],
    'Model1': [model1_grad.mean(), model1_grad.std(), model1_grad.max()],
    'Model2': [model2_grad.mean(), model2_grad.std(), model2_grad.max()]
})

print("\nSpatial Gradient Statistics:")
print(gradient_stats.set_index('Statistic'))

# Spatial verification metrics
fss1 = ms.FSS(observed_spatial, model1_spatial, window=5, threshold=np.percentile(observed_spatial, 75))
fss2 = ms.FSS(observed_spatial, model2_spatial, window=5, threshold=np.percentile(observed_spatial, 75))

sal_s1, sal_a1, sal_l1 = ms.SAL(observed_spatial, model1_spatial)
sal_s2, sal_a2, sal_l2 = ms.SAL(observed_spatial, model2_spatial)

print(f"\nSpatial Verification Metrics:")
print(f"  Model 1 - FSS: {fss1:.3f}, SAL(S={sal_s1:.3f}, A={sal_a1:.3f}, L={sal_l1:.3f})")
print(f"  Model 2 - FSS: {fss2:.3f}, SAL(S={sal_s2:.3f}, A={sal_a2:.3f}, L={sal_l2:.3f})")

Step 3: Spatial Pattern Correlation

# Calculate spatial pattern correlations
from scipy.stats import pearsonr

def spatial_pattern_correlation(obs, mod, window_size=5):
    """Calculate local spatial correlations"""
    correlations = np.zeros_like(obs, dtype=float)

    for i in range(window_size//2, obs.shape[0] - window_size//2):
        for j in range(window_size//2, obs.shape[1] - window_size//2):
            # Extract local window
            obs_window = obs[i-window_size//2:i+window_size//2+1,
                           j-window_size//2:j+window_size//2+1]
            mod_window = mod[i-window_size//2:i+window_size//2+1,
                           j-window_size//2:j+window_size//2+1]

            # Calculate correlation
            if not (np.isnan(obs_window).any() or np.isnan(mod_window).any()):
                corr, _ = pearsonr(obs_window.flatten(), mod_window.flatten())
                correlations[i, j] = corr

    return correlations

# Calculate local correlations
local_corr1 = spatial_pattern_correlation(observed_spatial, model1_spatial)
local_corr2 = spatial_pattern_correlation(observed_spatial, model2_spatial)

print(f"\nLocal Spatial Pattern Correlations:")
print(f"  Model 1 - Mean: {np.nanmean(local_corr1):.3f}, Std: {np.nanstd(local_corr1):.3f}")
print(f"  Model 2 - Mean: {np.nanmean(local_corr2):.3f}, Std: {np.nanstd(local_corr2):.3f}")

Best Practices for Climate Data Analysis

Data Quality Control

def quality_control_analysis(observed, modeled):
    """Perform comprehensive quality control"""

    # Check for data consistency
    qc_results = {}

    # Temporal consistency
    obs_diff = np.diff(observed)
    mod_diff = np.diff(modeled)
    qc_results['temporal_correlation'] = ms.pearsonr(obs_diff, mod_diff)[0]

    # Outlier detection (using IQR method)
    obs_q75, obs_q25 = np.percentile(observed, [75, 25])
    obs_iqr = obs_q75 - obs_q25
    obs_outliers = np.sum((observed < obs_q25 - 1.5 * obs_iqr) |
                          (observed > obs_q75 + 1.5 * obs_iqr))

    mod_q75, mod_q25 = np.percentile(modeled, [75, 25])
    mod_iqr = mod_q75 - mod_q25
    mod_outliers = np.sum((modeled < mod_q25 - 1.5 * mod_iqr) |
                          (modeled > mod_q75 + 1.5 * mod_iqr))

    qc_results['outlier_ratio'] = obs_outliers / mod_outliers if mod_outliers > 0 else np.nan

    return qc_results

# Example usage
qc = quality_control_analysis(observed_temps, model_temps)
print("\nQuality Control Analysis:")
for key, value in qc.items():
    print(f"  {key}: {value:.3f}")

Trend Analysis

def climate_trend_analysis(time_series, time):
    """Analyze climate trends with appropriate statistical methods"""

    # Linear trend
    trend_slope, trend_intercept = np.polyfit(time - time[0], time_series, 1)

    # Mann-Kendall trend test
    def mann_kendall_test(data):
        """Mann-Kendall trend test"""
        n = len(data)
        s = 0
        for i in range(n):
            for j in range(i + 1, n):
                if data[j] > data[i]:
                    s += 1
                elif data[j] < data[i]:
                    s -= 1

        # Calculate variance
        var_s = n * (n - 1) * (2 * n + 5) / 18

        # Calculate z-score
        if s > 0:
            z = (s - 1) / np.sqrt(var_s)
        elif s < 0:
            z = (s + 1) / np.sqrt(var_s)
        else:
            z = 0

        # Calculate p-value (two-tailed)
        from scipy.stats import norm
        p_value = 2 * (1 - norm.cdf(abs(z)))

        return z, p_value

    mk_z, mk_p = mann_kendall_test(time_series)

    # Theil-Sen slope estimator
    def theil_sen_slope(data, time):
        """Theil-Sen slope estimator"""
        slopes = []
        for i in range(len(data)):
            for j in range(i + 1, len(data)):
                if time[j] != time[i]:
                    slope = (data[j] - data[i]) / (time[j] - time[i])
                    slopes.append(slope)

        if len(slopes) > 0:
            return np.median(slopes)
        else:
            return np.nan

    theil_slope = theil_sen_slope(time_series, time)

    return {
        'linear_trend': trend_slope,
        'theil_slope': theil_slope,
        'mann_kendall_z': mk_z,
        'mann_kendall_p': mk_p
    }

# Example usage
trend_analysis = climate_trend_analysis(observed_temps, years)
print("\nClimate Trend Analysis:")
for key, value in trend_analysis.items():
    print(f"  {key}: {value:.4f}")

Summary

These climate data analysis workflows demonstrate how to use Monet Stats effectively for:

Temperature Trend Analysis: Multi-scale evaluation of long-term climate trends
Precipitation Extremes: Extreme value analysis and return period calculations
Multi-Model Ensembles: Ensemble construction and spread-skill relationships
Spatial Downscaling: High-resolution model evaluation and spatial verification

Each workflow can be adapted to specific climate data types and research questions. The key is to combine multiple complementary metrics and consider the specific characteristics of climate data.

For more specialized workflows, see:

Spatial & Ensemble Metrics
Wind Direction Metrics
Air Quality Model Evaluation