fbmc-chronos2/scripts/analyze_hourly_mae.py
Evgueni Poloukarov
feat: implement hour-aware adaptive quantile selection for hourly accuracy
3ac5032
#!/usr/bin/env python3
"""
Analyze hourly MAE patterns to establish a baseline before optimization.

This script loads the September 2025 forecast results and computes MAE per
hour-of-day to identify which hours have the highest errors (typically the
ramping hours, 5-9 and 17-21).
"""
import polars as pl
from pathlib import Path
from datetime import datetime

# Paths
PROJECT_ROOT = Path(__file__).parent.parent
FORECAST_PATH = PROJECT_ROOT / 'results' / 'september_2025_forecast_full_14day.parquet'
OUTPUT_PATH = PROJECT_ROOT / 'results' / 'september_2025_hourly_mae_baseline.csv'
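# Expected inputs (inferred from the column handling below, not enforced here):
# the forecast parquet carries a 'timestamp' column plus one '{border}_median'
# column per border, and the HF actuals dataset carries matching
# 'target_border_{border}' columns on the same hourly timestamps.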

def load_data():
    """Load forecast and actual data."""
    print('[INFO] Loading forecast results...')
    df_forecast = pl.read_parquet(FORECAST_PATH)
    print(f'[INFO] Forecast shape: {df_forecast.shape}')
    print(f'[INFO] Forecast period: {df_forecast["timestamp"].min()} to {df_forecast["timestamp"].max()}')

    # Load actuals from HuggingFace dataset
    print('[INFO] Loading actuals from HuggingFace dataset...')
    from datasets import load_dataset
    import os

    dataset = load_dataset('evgueni-p/fbmc-features-24month', split='train', token=os.environ.get('HF_TOKEN'))
    df_actuals_full = pl.from_arrow(dataset.data.table)

    # Filter actuals to forecast period (Sept 2-15, 2025)
    forecast_start = datetime(2025, 9, 2)
    forecast_end = datetime(2025, 9, 16)
    df_actuals = df_actuals_full.filter(
        (pl.col('timestamp') >= forecast_start) &
        (pl.col('timestamp') < forecast_end)
    )
    print(f'[INFO] Actuals filtered: {df_actuals.shape[0]} hours')
    return df_forecast, df_actuals

def compute_hourly_mae(df_forecast, df_actuals):
    """Compute MAE per hour-of-day for all borders."""
    print('[INFO] Computing hourly MAE...')

    # Extract border names from forecast columns
    forecast_cols = [col for col in df_forecast.columns if col.endswith('_median')]
    border_names = [col.replace('_median', '') for col in forecast_cols]
    print(f'[INFO] Processing {len(border_names)} borders...')

    hourly_results = []
    for border in border_names:
        forecast_col = f'{border}_median'
        actual_col = f'target_border_{border}'

        # Skip if actual column missing
        if actual_col not in df_actuals.columns:
            print(f'[WARNING] Skipping {border} - no actual data')
            continue

        # Create unified dataframe
        df_border = df_forecast.select(['timestamp', forecast_col]).join(
            df_actuals.select(['timestamp', actual_col]),
            on='timestamp',
            how='inner'
        )

        # Add hour-of-day
        df_border = df_border.with_columns([
            pl.col('timestamp').dt.hour().alias('hour')
        ])

        # Compute MAE per hour
        for hour in range(24):
            hour_df = df_border.filter(pl.col('hour') == hour)
            if len(hour_df) == 0:
                continue
            mae = (hour_df[forecast_col] - hour_df[actual_col]).abs().mean()
            hourly_results.append({
                'border': border,
                'hour': hour,
                'mae': mae,
                'n_hours': len(hour_df)
            })

    return pl.DataFrame(hourly_results)

def analyze_patterns(df_hourly):
    """Analyze hourly MAE patterns."""
    print('\n' + '=' * 60)
    print('HOURLY MAE ANALYSIS')
    print('=' * 60)

    # Overall statistics per hour (aggregated across all borders)
    hourly_stats = df_hourly.group_by('hour').agg([
        pl.col('mae').mean().alias('mean_mae'),
        pl.col('mae').median().alias('median_mae'),
        pl.col('mae').std().alias('std_mae'),
        pl.col('mae').min().alias('min_mae'),
        pl.col('mae').max().alias('max_mae'),
        pl.col('border').count().alias('n_borders')
    ]).sort('hour')
    print('\n[INFO] MAE by Hour-of-Day (Averaged Across All Borders):')
    print(hourly_stats)

    # Identify problem hours (highest MAE)
    print('\n[INFO] Top 5 Worst Hours (Highest MAE):')
    worst_hours = hourly_stats.sort('mean_mae', descending=True).head(5)
    print(worst_hours)

    # Identify best hours (lowest MAE)
    print('\n[INFO] Top 5 Best Hours (Lowest MAE):')
    best_hours = hourly_stats.sort('mean_mae').head(5)
    print(best_hours)

    # Ramping hour analysis
    ramping_hours = [5, 6, 7, 8, 9, 17, 18, 19, 20, 21]
    non_ramping_hours = [h for h in range(24) if h not in ramping_hours]
    ramping_mae = hourly_stats.filter(pl.col('hour').is_in(ramping_hours))['mean_mae'].mean()
    non_ramping_mae = hourly_stats.filter(pl.col('hour').is_in(non_ramping_hours))['mean_mae'].mean()
    print(f'\n[INFO] Ramping hours (5-9, 17-21) MAE: {ramping_mae:.2f} MW')
    print(f'[INFO] Non-ramping hours MAE: {non_ramping_mae:.2f} MW')
    print(f'[INFO] Ramping penalty: {(ramping_mae - non_ramping_mae) / non_ramping_mae * 100:.1f}% higher')

    # Peak hour analysis
    peak_hours = [7, 8, 9, 17, 18, 19, 20]
    peak_mae = hourly_stats.filter(pl.col('hour').is_in(peak_hours))['mean_mae'].mean()
    print(f'\n[INFO] Peak hours (7-9, 17-20) MAE: {peak_mae:.2f} MW')

    # Night hour analysis
    night_hours = [22, 23, 0, 1, 2, 3, 4]
    night_mae = hourly_stats.filter(pl.col('hour').is_in(night_hours))['mean_mae'].mean()
    print(f'[INFO] Night hours (22-4) MAE: {night_mae:.2f} MW')

    return hourly_stats

def identify_problematic_borders(df_hourly):
    """Identify borders with the largest hourly MAE variation."""
    print('\n[INFO] Borders with Highest Hourly MAE Variation:')
    border_variation = df_hourly.group_by('border').agg([
        pl.col('mae').mean().alias('mean_mae'),
        pl.col('mae').std().alias('std_mae'),
        pl.col('mae').max().alias('max_mae'),
        (pl.col('mae').max() - pl.col('mae').min()).alias('range_mae')
    ]).sort('std_mae', descending=True)
    print(border_variation.head(10))
    return border_variation

def main():
    """Main analysis workflow."""
    print('[START] Hourly MAE Baseline Analysis')
    print(f'[INFO] Forecast file: {FORECAST_PATH}')

    # Load data
    df_forecast, df_actuals = load_data()

    # Compute hourly MAE
    df_hourly = compute_hourly_mae(df_forecast, df_actuals)
    print(f'\n[INFO] Computed hourly MAE for {df_hourly["border"].n_unique()} borders')

    # Analyze patterns
    hourly_stats = analyze_patterns(df_hourly)

    # Identify problematic borders
    border_variation = identify_problematic_borders(df_hourly)

    # Save detailed results
    df_hourly.write_csv(OUTPUT_PATH)
    print(f'\n[INFO] Detailed hourly MAE saved to: {OUTPUT_PATH}')

    # Save summary stats
    summary_path = PROJECT_ROOT / 'results' / 'september_2025_hourly_summary.csv'
    hourly_stats.write_csv(summary_path)
    print(f'[INFO] Hourly summary saved to: {summary_path}')

    print('\n[SUCCESS] Hourly MAE baseline analysis complete!')


if __name__ == '__main__':
    main()
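
# Example invocation (a sketch; assumes the forecast parquet exists under
# results/ and that HF_TOKEN grants read access to the
# 'evgueni-p/fbmc-features-24month' dataset):
#
#   HF_TOKEN=<your token> python scripts/analyze_hourly_mae.py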