Spaces:
Sleeping
Sleeping
File size: 6,907 Bytes
3ac5032 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 |
#!/usr/bin/env python3
"""
Analyze hourly MAE patterns to establish baseline before optimization.
This script loads September 2025 forecast results and computes MAE per hour-of-day
to identify which hours have highest errors (likely ramping hours: 7-9, 17-21).
"""
import polars as pl
import numpy as np
from pathlib import Path
from datetime import datetime
# Paths
PROJECT_ROOT = Path(__file__).parent.parent
FORECAST_PATH = PROJECT_ROOT / 'results' / 'september_2025_forecast_full_14day.parquet'
OUTPUT_PATH = PROJECT_ROOT / 'results' / 'september_2025_hourly_mae_baseline.csv'
def load_data():
"""Load forecast and actual data."""
print('[INFO] Loading forecast results...')
df_forecast = pl.read_parquet(FORECAST_PATH)
print(f'[INFO] Forecast shape: {df_forecast.shape}')
print(f'[INFO] Forecast period: {df_forecast["timestamp"].min()} to {df_forecast["timestamp"].max()}')
# Load actuals from HuggingFace dataset
print('[INFO] Loading actuals from HuggingFace dataset...')
from datasets import load_dataset
import os
dataset = load_dataset('evgueni-p/fbmc-features-24month', split='train', token=os.environ.get('HF_TOKEN'))
df_actuals_full = pl.from_arrow(dataset.data.table)
# Filter actuals to forecast period (Sept 2-15, 2025)
forecast_start = datetime(2025, 9, 2)
forecast_end = datetime(2025, 9, 16)
df_actuals = df_actuals_full.filter(
(pl.col('timestamp') >= forecast_start) &
(pl.col('timestamp') < forecast_end)
)
print(f'[INFO] Actuals filtered: {df_actuals.shape[0]} hours')
return df_forecast, df_actuals
def compute_hourly_mae(df_forecast, df_actuals):
"""Compute MAE per hour-of-day for all borders."""
print('[INFO] Computing hourly MAE...')
# Extract border names from forecast columns
forecast_cols = [col for col in df_forecast.columns if col.endswith('_median')]
border_names = [col.replace('_median', '') for col in forecast_cols]
print(f'[INFO] Processing {len(border_names)} borders...')
hourly_results = []
for border in border_names:
forecast_col = f'{border}_median'
actual_col = f'target_border_{border}'
# Skip if actual column missing
if actual_col not in df_actuals.columns:
print(f'[WARNING] Skipping {border} - no actual data')
continue
# Create unified dataframe
df_border = df_forecast.select(['timestamp', forecast_col]).join(
df_actuals.select(['timestamp', actual_col]),
on='timestamp',
how='inner'
)
# Add hour-of-day
df_border = df_border.with_columns([
pl.col('timestamp').dt.hour().alias('hour')
])
# Compute MAE per hour
for hour in range(24):
hour_df = df_border.filter(pl.col('hour') == hour)
if len(hour_df) == 0:
continue
mae = (hour_df[forecast_col] - hour_df[actual_col]).abs().mean()
hourly_results.append({
'border': border,
'hour': hour,
'mae': mae,
'n_hours': len(hour_df)
})
return pl.DataFrame(hourly_results)
def analyze_patterns(df_hourly):
"""Analyze hourly MAE patterns."""
print('\n' + '='*60)
print('HOURLY MAE ANALYSIS')
print('='*60)
# Overall statistics per hour (aggregated across all borders)
hourly_stats = df_hourly.group_by('hour').agg([
pl.col('mae').mean().alias('mean_mae'),
pl.col('mae').median().alias('median_mae'),
pl.col('mae').std().alias('std_mae'),
pl.col('mae').min().alias('min_mae'),
pl.col('mae').max().alias('max_mae'),
pl.col('border').count().alias('n_borders')
]).sort('hour')
print('\n[INFO] MAE by Hour-of-Day (Averaged Across All Borders):')
print(hourly_stats)
# Identify problem hours (highest MAE)
print('\n[INFO] Top 5 Worst Hours (Highest MAE):')
worst_hours = hourly_stats.sort('mean_mae', descending=True).head(5)
print(worst_hours)
# Identify best hours (lowest MAE)
print('\n[INFO] Top 5 Best Hours (Lowest MAE):')
best_hours = hourly_stats.sort('mean_mae').head(5)
print(best_hours)
# Ramping hour analysis
ramping_hours = [5, 6, 7, 8, 9, 17, 18, 19, 20, 21]
non_ramping_hours = [h for h in range(24) if h not in ramping_hours]
ramping_mae = hourly_stats.filter(pl.col('hour').is_in(ramping_hours))['mean_mae'].mean()
non_ramping_mae = hourly_stats.filter(pl.col('hour').is_in(non_ramping_hours))['mean_mae'].mean()
print(f'\n[INFO] Ramping hours (5-9, 17-21) MAE: {ramping_mae:.2f} MW')
print(f'[INFO] Non-ramping hours MAE: {non_ramping_mae:.2f} MW')
print(f'[INFO] Ramping penalty: {(ramping_mae - non_ramping_mae) / non_ramping_mae * 100:.1f}% higher')
# Peak hour analysis
peak_hours = [7, 8, 9, 17, 18, 19, 20]
peak_mae = hourly_stats.filter(pl.col('hour').is_in(peak_hours))['mean_mae'].mean()
print(f'\n[INFO] Peak hours (7-9, 17-20) MAE: {peak_mae:.2f} MW')
# Night hour analysis
night_hours = [22, 23, 0, 1, 2, 3, 4]
night_mae = hourly_stats.filter(pl.col('hour').is_in(night_hours))['mean_mae'].mean()
print(f'[INFO] Night hours (22-4) MAE: {night_mae:.2f} MW')
return hourly_stats
def identify_problematic_borders(df_hourly):
"""Identify borders with largest hourly MAE variations."""
print('\n[INFO] Borders with Highest Hourly MAE Variation:')
border_variation = df_hourly.group_by('border').agg([
pl.col('mae').mean().alias('mean_mae'),
pl.col('mae').std().alias('std_mae'),
pl.col('mae').max().alias('max_mae'),
(pl.col('mae').max() - pl.col('mae').min()).alias('range_mae')
]).sort('std_mae', descending=True)
print(border_variation.head(10))
return border_variation
def main():
"""Main analysis workflow."""
print('[START] Hourly MAE Baseline Analysis')
print(f'[INFO] Forecast file: {FORECAST_PATH}')
# Load data
df_forecast, df_actuals = load_data()
# Compute hourly MAE
df_hourly = compute_hourly_mae(df_forecast, df_actuals)
print(f'\n[INFO] Computed hourly MAE for {df_hourly["border"].n_unique()} borders')
# Analyze patterns
hourly_stats = analyze_patterns(df_hourly)
# Identify problematic borders
border_variation = identify_problematic_borders(df_hourly)
# Save detailed results
df_hourly.write_csv(OUTPUT_PATH)
print(f'\n[INFO] Detailed hourly MAE saved to: {OUTPUT_PATH}')
# Save summary stats
summary_path = PROJECT_ROOT / 'results' / 'september_2025_hourly_summary.csv'
hourly_stats.write_csv(summary_path)
print(f'[INFO] Hourly summary saved to: {summary_path}')
print('\n[SUCCESS] Hourly MAE baseline analysis complete!')
if __name__ == '__main__':
main()
|