#!/usr/bin/env python3
"""
Analyze hourly MAE patterns to establish a baseline before optimization.

This script loads September 2025 forecast results and computes MAE per
hour-of-day to identify which hours have the highest errors (likely the
ramping hours: 5-9 and 17-21).
"""
import polars as pl
from pathlib import Path
from datetime import datetime

# Paths
PROJECT_ROOT = Path(__file__).parent.parent
FORECAST_PATH = PROJECT_ROOT / 'results' / 'september_2025_forecast_full_14day.parquet'
OUTPUT_PATH = PROJECT_ROOT / 'results' / 'september_2025_hourly_mae_baseline.csv'

def load_data():
    """Load forecast and actual data."""
    print('[INFO] Loading forecast results...')
    df_forecast = pl.read_parquet(FORECAST_PATH)
    print(f'[INFO] Forecast shape: {df_forecast.shape}')
    print(f'[INFO] Forecast period: {df_forecast["timestamp"].min()} to {df_forecast["timestamp"].max()}')

    # Load actuals from the HuggingFace dataset
    print('[INFO] Loading actuals from HuggingFace dataset...')
    from datasets import load_dataset
    import os
    dataset = load_dataset('evgueni-p/fbmc-features-24month', split='train',
                           token=os.environ.get('HF_TOKEN'))
    df_actuals_full = pl.from_arrow(dataset.data.table)

    # Filter actuals to the forecast period (Sept 2-15, 2025; end exclusive)
    forecast_start = datetime(2025, 9, 2)
    forecast_end = datetime(2025, 9, 16)
    df_actuals = df_actuals_full.filter(
        (pl.col('timestamp') >= forecast_start) &
        (pl.col('timestamp') < forecast_end)
    )
    print(f'[INFO] Actuals filtered: {df_actuals.shape[0]} hours')
    return df_forecast, df_actuals
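
# Expected coverage (a hedged sanity note, not in the original script): the
# Sept 2-15 window spans 14 days x 24 hours = 336 hourly rows, so the
# '[INFO] Actuals filtered' count above should read 336 if the dataset has
# no gaps.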

def compute_hourly_mae(df_forecast, df_actuals):
    """Compute MAE per hour-of-day for all borders."""
    print('[INFO] Computing hourly MAE...')

    # Extract border names from the forecast's *_median columns
    forecast_cols = [col for col in df_forecast.columns if col.endswith('_median')]
    border_names = [col.replace('_median', '') for col in forecast_cols]
    print(f'[INFO] Processing {len(border_names)} borders...')

    hourly_results = []
    for border in border_names:
        forecast_col = f'{border}_median'
        actual_col = f'target_border_{border}'

        # Skip borders with no actual data
        if actual_col not in df_actuals.columns:
            print(f'[WARNING] Skipping {border} - no actual data')
            continue

        # Align forecast and actuals on timestamp
        df_border = df_forecast.select(['timestamp', forecast_col]).join(
            df_actuals.select(['timestamp', actual_col]),
            on='timestamp',
            how='inner'
        )

        # Add hour-of-day
        df_border = df_border.with_columns(
            pl.col('timestamp').dt.hour().alias('hour')
        )

        # Compute MAE per hour
        for hour in range(24):
            hour_df = df_border.filter(pl.col('hour') == hour)
            if len(hour_df) == 0:
                continue
            mae = (hour_df[forecast_col] - hour_df[actual_col]).abs().mean()
            hourly_results.append({
                'border': border,
                'hour': hour,
                'mae': mae,
                'n_hours': len(hour_df)
            })

    return pl.DataFrame(hourly_results)
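
# Vectorized alternative (a sketch, not called anywhere): the 24-iteration
# loop above can be collapsed into a single group_by per border. Shown for
# reference; it assumes the same df_border layout built in compute_hourly_mae.
def _hourly_mae_vectorized(df_border, forecast_col, actual_col):
    """Per-hour MAE for one border via group_by instead of a Python loop."""
    return (
        df_border
        .with_columns((pl.col(forecast_col) - pl.col(actual_col)).abs().alias('abs_err'))
        .group_by('hour')
        .agg(
            pl.col('abs_err').mean().alias('mae'),
            pl.len().alias('n_hours'),
        )
        .sort('hour')
    )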

def analyze_patterns(df_hourly):
    """Analyze hourly MAE patterns."""
    print('\n' + '=' * 60)
    print('HOURLY MAE ANALYSIS')
    print('=' * 60)

    # Overall statistics per hour, aggregated across all borders
    hourly_stats = df_hourly.group_by('hour').agg([
        pl.col('mae').mean().alias('mean_mae'),
        pl.col('mae').median().alias('median_mae'),
        pl.col('mae').std().alias('std_mae'),
        pl.col('mae').min().alias('min_mae'),
        pl.col('mae').max().alias('max_mae'),
        pl.col('border').count().alias('n_borders')
    ]).sort('hour')
    print('\n[INFO] MAE by Hour-of-Day (Averaged Across All Borders):')
    print(hourly_stats)

    # Identify problem hours (highest MAE)
    print('\n[INFO] Top 5 Worst Hours (Highest MAE):')
    worst_hours = hourly_stats.sort('mean_mae', descending=True).head(5)
    print(worst_hours)

    # Identify best hours (lowest MAE)
    print('\n[INFO] Top 5 Best Hours (Lowest MAE):')
    best_hours = hourly_stats.sort('mean_mae').head(5)
    print(best_hours)

    # Ramping-hour analysis (morning and evening ramps)
    ramping_hours = [5, 6, 7, 8, 9, 17, 18, 19, 20, 21]
    non_ramping_hours = [h for h in range(24) if h not in ramping_hours]
    ramping_mae = hourly_stats.filter(pl.col('hour').is_in(ramping_hours))['mean_mae'].mean()
    non_ramping_mae = hourly_stats.filter(pl.col('hour').is_in(non_ramping_hours))['mean_mae'].mean()
    print(f'\n[INFO] Ramping hours (5-9, 17-21) MAE: {ramping_mae:.2f} MW')
    print(f'[INFO] Non-ramping hours MAE: {non_ramping_mae:.2f} MW')
    print(f'[INFO] Ramping penalty: {(ramping_mae - non_ramping_mae) / non_ramping_mae * 100:.1f}% higher')

    # Peak-hour analysis
    peak_hours = [7, 8, 9, 17, 18, 19, 20]
    peak_mae = hourly_stats.filter(pl.col('hour').is_in(peak_hours))['mean_mae'].mean()
    print(f'\n[INFO] Peak hours (7-9, 17-20) MAE: {peak_mae:.2f} MW')

    # Night-hour analysis
    night_hours = [22, 23, 0, 1, 2, 3, 4]
    night_mae = hourly_stats.filter(pl.col('hour').is_in(night_hours))['mean_mae'].mean()
    print(f'[INFO] Night hours (22-4) MAE: {night_mae:.2f} MW')

    return hourly_stats
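
# Worked example of the ramping-penalty formula above, with made-up numbers:
# if ramping_mae = 240.0 MW and non_ramping_mae = 200.0 MW, the penalty is
# (240.0 - 200.0) / 200.0 * 100 = 20.0% higher. (The formula divides by
# non_ramping_mae, so it is undefined if that baseline is zero.)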

def identify_problematic_borders(df_hourly):
    """Identify borders with the largest hourly MAE variation."""
    print('\n[INFO] Borders with Highest Hourly MAE Variation:')
    border_variation = df_hourly.group_by('border').agg([
        pl.col('mae').mean().alias('mean_mae'),
        pl.col('mae').std().alias('std_mae'),
        pl.col('mae').max().alias('max_mae'),
        (pl.col('mae').max() - pl.col('mae').min()).alias('range_mae')
    ]).sort('std_mae', descending=True)
    print(border_variation.head(10))
    return border_variation
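
# Interpretation note: borders ranked highest by std_mae vary most by
# hour-of-day, making them the natural first candidates for the hour-aware
# adaptive quantile selection this baseline is meant to support.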

def main():
    """Main analysis workflow."""
    print('[START] Hourly MAE Baseline Analysis')
    print(f'[INFO] Forecast file: {FORECAST_PATH}')

    # Load data
    df_forecast, df_actuals = load_data()

    # Compute hourly MAE
    df_hourly = compute_hourly_mae(df_forecast, df_actuals)
    print(f'\n[INFO] Computed hourly MAE for {df_hourly["border"].n_unique()} borders')

    # Analyze patterns
    hourly_stats = analyze_patterns(df_hourly)

    # Identify problematic borders
    identify_problematic_borders(df_hourly)

    # Save detailed results
    df_hourly.write_csv(OUTPUT_PATH)
    print(f'\n[INFO] Detailed hourly MAE saved to: {OUTPUT_PATH}')

    # Save summary stats
    summary_path = PROJECT_ROOT / 'results' / 'september_2025_hourly_summary.csv'
    hourly_stats.write_csv(summary_path)
    print(f'[INFO] Hourly summary saved to: {summary_path}')

    print('\n[SUCCESS] Hourly MAE baseline analysis complete!')


if __name__ == '__main__':
    main()
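
# Example of reading the baseline back for downstream tuning (a sketch; the
# column names match what compute_hourly_mae writes to OUTPUT_PATH):
#
#   df = pl.read_csv('results/september_2025_hourly_mae_baseline.csv')
#   worst = df.sort('mae', descending=True).head(10)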