Spaces:

evgueni-p
/

fbmc-chronos2

Sleeping

File size: 12,927 Bytes

#!/usr/bin/env python3
"""
Gradio Interface for Dynamic Forecast System
Interactive interface for time-aware forecasting with run date selection.
"""

import os
import gradio as gr
import polars as pl
import pandas as pd
from datetime import datetime, timedelta
from datasets import load_dataset
from src.forecasting.dynamic_forecast import DynamicForecast
from src.forecasting.feature_availability import FeatureAvailability

# Module-level cache shared by the UI callbacks. All three names stay None
# until load_data() has run and filled them in.
dataset = forecaster = borders = None

def load_data():
    """Download the feature dataset and build the shared forecaster.

    Reads the ``HF_TOKEN`` environment variable, loads the ``train`` split of
    ``evgueni-p/fbmc-features-24month`` and stores it as a polars DataFrame in
    the module-level ``dataset``. The ``timestamp`` column is converted to a
    datetime dtype when it is not one already. A ``DynamicForecast`` configured
    with 512 context hours and 336 forecast hours is stored in ``forecaster``,
    and ``borders`` receives the names of all columns prefixed with
    ``target_border_`` with that prefix removed.

    Returns:
        True once all three module-level values have been populated.

    Raises:
        ValueError: If ``HF_TOKEN`` is unset or empty.
    """
    global dataset, forecaster, borders

    print("[*] Loading dataset from HuggingFace...")

    # The dataset is fetched with an access token taken from the environment.
    token = os.getenv("HF_TOKEN")
    if not token:
        raise ValueError(
            "HF_TOKEN not found in environment variables. "
            "Please set HF_TOKEN in your environment or .env file."
        )

    hf_split = load_dataset("evgueni-p/fbmc-features-24month", split="train", token=token)
    dataset = pl.from_pandas(hf_split.to_pandas())

    # Normalise the timestamp column: parse strings, cast anything else that
    # is not already a datetime.
    ts_dtype = dataset['timestamp'].dtype
    if ts_dtype == pl.String:
        dataset = dataset.with_columns(pl.col('timestamp').str.to_datetime())
    elif ts_dtype != pl.Datetime:
        dataset = dataset.with_columns(pl.col('timestamp').cast(pl.Datetime))

    # Forecast horizon is fixed at 14 days (336 hours).
    forecaster = DynamicForecast(dataset=dataset, context_hours=512, forecast_hours=336)

    # Border names are derived from the target column names.
    prefix = 'target_border_'
    borders = [name.replace(prefix, '') for name in dataset.columns if name.startswith(prefix)]

    print(f"[OK] Loaded {len(dataset)} rows, {len(dataset.columns)} columns")
    print(f"[OK] Found {len(borders)} borders")
    print(f"[OK] Date range: {dataset['timestamp'].min()} to {dataset['timestamp'].max()}")

    return True


def get_dataset_info():
    """Summarise the loaded dataset as display text.

    Returns:
        ``"Dataset not loaded"`` while the module-level ``dataset`` is None;
        otherwise a multi-line string listing the row count, column count,
        minimum/maximum of the ``timestamp`` column and the number of borders.
    """
    if dataset is None:
        return "Dataset not loaded"

    timestamps = dataset['timestamp']
    first_ts, last_ts = str(timestamps.min()), str(timestamps.max())
    n_rows = len(dataset)
    n_cols = len(dataset.columns)
    n_borders = len(borders)

    return f"""
    **Dataset Information**
    - Total rows: {n_rows:,}
    - Total columns: {n_cols}
    - Date range: {first_ts} to {last_ts}
    - Borders available: {n_borders}
    """


def get_feature_summary():
    """Describe how many features fall into each availability category.

    Returns:
        ``"Forecaster not initialized"`` while the module-level ``forecaster``
        is None; otherwise a multi-line string built from the
        ``full_horizon_d14``, ``partial_d1``, ``historical`` and ``total``
        entries of ``forecaster.get_feature_summary()``.
    """
    if forecaster is None:
        return "Forecaster not initialized"

    counts = forecaster.get_feature_summary()
    n_full = counts['full_horizon_d14']
    n_partial = counts['partial_d1']
    n_historical = counts['historical']
    n_total = counts['total']

    return f"""
    **Feature Categorization**
    - Full-horizon D+14: {n_full} features
      (temporal, weather, CNEC outages, LTA)
    - Partial D+1: {n_partial} features
      (load forecasts, masked D+2-D+14)
    - Historical only: {n_historical} features
      (prices, generation, demand, lags, etc.)
    - **Total: {n_total} features**
    """


def validate_run_date(run_date_str, *, context_hours=512, forecast_hours=336):
    """Check that a run date leaves enough data on both sides of it.

    The run date must lie at least ``context_hours`` after the earliest
    dataset timestamp and at least ``forecast_hours`` before the latest one.

    Args:
        run_date_str: Run date in ``YYYY-MM-DD HH:MM:SS`` format.
        context_hours: Hours of history required before the run date.
        forecast_hours: Hours of data required after the run date.

    Returns:
        A ``(is_valid, message)`` tuple. ``message`` explains the failure, or
        is ``"Run date valid"`` on success.
    """
    if not run_date_str:
        return False, "Please select a run date"

    try:
        run_date = datetime.strptime(run_date_str, "%Y-%m-%d %H:%M:%S")
    except (ValueError, TypeError):
        # ValueError: text does not match the format; TypeError: not a string.
        # Catching only these keeps KeyboardInterrupt/SystemExit propagating.
        return False, "Invalid date format (use YYYY-MM-DD HH:MM:SS)"

    # Without a loaded dataset there are no bounds to compare against.
    if dataset is None:
        return False, "Dataset not loaded"

    # NOTE(review): assumes the timestamp column yields naive datetimes that
    # compare directly with the parsed run date - confirm against the dataset.
    timestamps = dataset['timestamp']
    min_valid = timestamps.min() + timedelta(hours=context_hours)
    max_valid = timestamps.max() - timedelta(hours=forecast_hours)

    if run_date < min_valid:
        return False, f"Run date too early (need {context_hours}h context). Minimum: {min_valid}"

    if run_date > max_valid:
        return False, f"Run date too late (need {forecast_hours}h future data). Maximum: {max_valid}"

    return True, "Run date valid"


def prepare_forecast(run_date_str, border):
    """Build the dry-run forecast report for one run date and border.

    Args:
        run_date_str: Run date in ``YYYY-MM-DD HH:MM:SS`` format.
        border: Name of the border to prepare data for.

    Returns:
        A 3-tuple ``(summary, context_preview, future_preview)`` of strings.
        When something goes wrong the first element carries the error text
        and the two previews are empty strings. Exceptions raised while
        preparing the data are caught and reported the same way.
    """
    def _failure(message):
        # Every error path returns the message plus two empty previews.
        return message, "", ""

    if dataset is None or forecaster is None:
        return _failure("Error: Dataset not loaded")

    if not border:
        return _failure("Error: Please select a border")

    date_ok, date_msg = validate_run_date(run_date_str)
    if not date_ok:
        return _failure(f"Error: {date_msg}")

    try:
        run_date = datetime.strptime(run_date_str, "%Y-%m-%d %H:%M:%S")

        # Extract the history and the future covariates for this run date.
        context_data, future_data = forecaster.prepare_forecast_data(run_date, border)

        # Refuse to report anything if the forecaster flags leakage.
        leak_free, errors = forecaster.validate_no_leakage(context_data, future_data, run_date)
        if not leak_free:
            bullet_lines = [f"- {e}" for e in errors]
            return _failure("Data leakage detected:\n" + "\n".join(bullet_lines))

        # The forecast covers 336 hourly steps starting one hour after run_date.
        forecast_start = run_date + timedelta(hours=1)
        forecast_end = forecast_start + timedelta(hours=335)

        result = f"""
        **Forecast Configuration**
        - Border: {border}
        - Run date: {run_date}
        - Forecast horizon: D+1 to D+14 (336 hours, FIXED)
        - Forecast period: {forecast_start} to {forecast_end}

        **Data Preparation Summary**
        - Context shape: {context_data.shape} (historical data)
        - Future shape: {future_data.shape} (future covariates)
        - Context dates: {context_data['timestamp'].min()} to {context_data['timestamp'].max()}
        - Future dates: {future_data['timestamp'].min()} to {future_data['timestamp'].max()}
        - Leakage validation: PASSED

        **Feature Availability**
        - Full-horizon D+14: Available for all 336 hours
        - Partial D+1 (load forecasts): Available for first 24 hours, masked 25-336
        - Historical features: Not used for forecasting (context only)

        **Next Steps**
        1. Data has been prepared with time-aware extraction
        2. Load forecast masking applied (D+1 only)
        3. LTA forward-filling applied (constant across horizon)
        4. Ready for Chronos-2 inference (requires GPU)

        **Note**: This is a dry-run demonstration. Actual inference requires GPU with Chronos-2 model.
        """

        # Text previews of the first ten rows of each frame.
        context_text = context_data.head(10).to_string()
        future_text = future_data.head(10).to_string()

        return result, context_text, future_text

    except Exception as e:
        return _failure(f"Error: {str(e)}")


def create_interface():
    """Load the data and assemble the Gradio Blocks application.

    Calls ``load_data()`` first, so the module-level ``dataset``,
    ``forecaster`` and ``borders`` are populated before any widget reads
    them (``load_data`` raises ``ValueError`` when ``HF_TOKEN`` is missing).
    The UI has three tabs: "Forecast Configuration" (dataset/feature info,
    run date and border inputs, result box), "Data Preview" (context and
    future previews) and "About" (static documentation). The
    "Prepare Forecast Data" button is wired to ``prepare_forecast``.

    Returns:
        The constructed ``gr.Blocks`` app; the caller is responsible for
        launching it.
    """
    # Load data at startup
    load_data()

    with gr.Blocks(title="FBMC Dynamic Forecast System") as app:
        gr.Markdown("# FBMC Dynamic Forecast System")
        # NOTE: the feature counts quoted in this text are literals; they are
        # not read from forecaster.get_feature_summary().
        gr.Markdown("""
        **Time-Aware Forecasting with Run Date Selection**

        This interface demonstrates the dynamic forecast pipeline that prevents data leakage
        by using only data available at the selected run date.

        **Key Features**:
        - Dynamic run date selection (prevents data leakage)
        - Fixed 14-day forecast horizon (D+1 to D+14, always 336 hours)
        - Time-aware feature categorization (603 full + 12 partial + 1,899 historical)
        - Availability masking for partial features (load forecasts D+1 only)
        - Built-in leakage validation
        """)

        # Tab 1: read-only dataset facts on the left, user inputs on the right,
        # and the preparation report underneath.
        with gr.Tab("Forecast Configuration"):
            with gr.Row():
                with gr.Column():
                    gr.Markdown("### Dataset Information")
                    # Both info boxes are filled once, when the interface is built.
                    dataset_info = gr.Textbox(
                        label="Dataset Info",
                        value=get_dataset_info(),
                        lines=8,
                        interactive=False
                    )

                    feature_summary = gr.Textbox(
                        label="Feature Summary",
                        value=get_feature_summary(),
                        lines=10,
                        interactive=False
                    )

                with gr.Column():
                    gr.Markdown("### Forecast Configuration")

                    # Pre-filled example value; it is only checked against the
                    # dataset range (via validate_run_date) when the button is clicked.
                    run_date_input = gr.Textbox(
                        label="Run Date (YYYY-MM-DD HH:MM:SS)",
                        placeholder="2025-08-15 23:00:00",
                        value="2025-08-15 23:00:00"
                    )

                    # Choices come from the borders discovered by load_data();
                    # the first one is preselected when any exist.
                    border_dropdown = gr.Dropdown(
                        label="Border",
                        choices=borders if borders else [],
                        value=borders[0] if borders else None
                    )

                    gr.Markdown("""
                    **Forecast Horizon**: Fixed at 14 days (D+1 to D+14, 336 hours)

                    **Validation Rules**:
                    - Run date must have 512 hours of historical context
                    - Run date must have 336 hours of future data (for this demo)
                    - Valid range: ~22 days from dataset start to ~14 days before dataset end
                    """)

                    prepare_btn = gr.Button("Prepare Forecast Data", variant="primary")

            with gr.Row():
                # Receives the first element returned by prepare_forecast.
                result_output = gr.Textbox(
                    label="Forecast Preparation Result",
                    lines=25,
                    interactive=False
                )

        # Tab 2: text previews, filled by the second and third elements
        # returned by prepare_forecast.
        with gr.Tab("Data Preview"):
            with gr.Row():
                context_preview = gr.Textbox(
                    label="Context Data (first 10 rows)",
                    lines=20,
                    interactive=False
                )

                future_preview = gr.Textbox(
                    label="Future Covariates (first 10 rows)",
                    lines=20,
                    interactive=False
                )

        # Tab 3: static documentation text only; nothing here is computed.
        with gr.Tab("About"):
            gr.Markdown("""
            ## About This System

            ### Purpose
            Prevent data leakage in FBMC cross-border flow forecasting by implementing
            time-aware data extraction that respects feature availability windows.

            ### Architecture
            1. **Feature Categorization**: All 2,514 features categorized by availability
               - Full-horizon D+14: 603 features (temporal, weather, outages, LTA)
               - Partial D+1: 12 features (load forecasts, masked D+2-D+14)
               - Historical: 1,899 features (prices, generation, demand, lags)

            2. **Time-Aware Extraction**: DynamicForecast class
               - Extracts context data (all data before run_date)
               - Extracts future covariates (D+1 to D+14 only)
               - Applies availability masking for partial features

            3. **Leakage Validation**: Built-in checks
               - Context timestamps < run_date
               - Future timestamps >= run_date + 1 hour
               - No overlap between context and future
               - Only future covariates in future data

            ### Forecast Horizon
            - **FIXED at 14 days** (D+1 to D+14, 336 hours)
            - No horizon selector needed (always forecasts full 14 days)
            - D+1 starts 1 hour after run_date (ET convention)

            ### Feature Availability
            - **Load Forecasts**: Published day-ahead, available D+1 only
            - **Weather**: Forecasts available for full D+14 horizon
            - **CNEC Outages**: Planned maintenance published weeks ahead
            - **LTA**: Long-term allocations, forward-filled from D+0
            - **Historical**: Prices, generation, demand (context only)

            ### Time Conventions
            - **Electricity Time (ET)**: Hour 1 = 00:00-01:00, Hour 24 = 23:00-00:00
            - **D+1**: Next day, hours 1-24 (24 hours starting at 00:00)
            - **D+14**: 14 days ahead (336 hours total)

            ### Model
            - **Chronos 2 Large** (710M params, zero-shot inference)
            - Supports partial availability via NaN masking
            - Multivariate time series forecasting

            ### Files
            - `src/forecasting/feature_availability.py`: Feature categorization
            - `src/forecasting/dynamic_forecast.py`: Time-aware data extraction
            - `smoke_test.py`, `full_inference.py`: Updated inference scripts
            - `tests/test_feature_availability.py`: Unit tests (27 tests, all passing)

            ### Authors
            Evgueni Poloukarov, 2025-11-13
            """)

        # Wire up the button: the two inputs map to prepare_forecast's
        # (run_date_str, border) parameters and its 3-tuple result maps, in
        # order, onto the three output textboxes.
        prepare_btn.click(
            fn=prepare_forecast,
            inputs=[run_date_input, border_dropdown],
            outputs=[result_output, context_preview, future_preview]
        )

    return app


if __name__ == "__main__":
    # Script entry point: build the interface (which also loads the dataset)
    # and start the Gradio server with fixed host/port and sharing disabled.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    app = create_interface()
    app.launch(**launch_options)