Spaces:

evgueni-p
/

fbmc-chronos2

Sleeping

File size: 19,463 Bytes

27cb60a

"""FBMC Flow Forecasting - Unified JAO Data Exploration

Objective: Explore unified 24-month JAO data and engineered features

This notebook explores:
1. Unified JAO dataset (MaxBEX + CNEC + LTA + NetPos)
2. Engineered features (726 features across 5 categories)
3. Feature completeness and validation
4. Key statistics and distributions

Usage:
    marimo edit notebooks/02_unified_jao_exploration.py
"""

import marimo

__generated_with = "0.17.2"
app = marimo.App(width="medium")


@app.cell
def _():
    import marimo as mo
    import polars as pl
    import altair as alt
    from pathlib import Path
    import numpy as np
    return Path, alt, mo, pl


@app.cell
def _(mo):
    mo.md(
        r"""
    # Unified JAO Data Exploration (24 Months)

    **Date Range**: October 2023 - October 2025 (24 months)

    ## Data Pipeline Overview:

    1. **Raw JAO Data** (4 datasets)
       - MaxBEX: Maximum Bilateral Exchange capacity (TARGET)
       - CNEC/PTDF: Critical constraints with power transfer factors
       - LTA: Long Term Allocations (future covariates)
       - Net Positions: Domain boundaries (min/max per zone)

    2. **Data Unification** → `unified_jao_24month.parquet`
       - Deduplicated NetPos (removed 1,152 duplicate timestamps)
       - Forward-filled LTA gaps (710 missing hours)
       - Broadcast daily CNEC to hourly
       - Sorted timeline (hourly, 17,544 records)

    3. **Feature Engineering** → `features_jao_24month.parquet`
       - 726 features across 5 categories
       - Tier-1 CNEC: 274 features
       - Tier-2 CNEC: 390 features
       - LTA: 40 features
       - Temporal: 12 features
       - Targets: 10 features
    """
    )
    return


@app.cell
def _(Path, pl):
    # Load unified datasets
    print("Loading unified JAO datasets...")

    processed_dir = Path('data/processed')

    unified_jao = pl.read_parquet(processed_dir / 'unified_jao_24month.parquet')
    cnec_hourly = pl.read_parquet(processed_dir / 'cnec_hourly_24month.parquet')
    features_jao = pl.read_parquet(processed_dir / 'features_jao_24month.parquet')

    print(f"[OK] Unified JAO: {unified_jao.shape}")
    print(f"[OK] CNEC hourly: {cnec_hourly.shape}")
    print(f"[OK] Features: {features_jao.shape}")
    return features_jao, unified_jao


@app.cell
def _(features_jao, mo, unified_jao):
    # Dataset overview
    mo.md(f"""
    ## Dataset Overview

    ### 1. Unified JAO Dataset
    - **Shape**: {unified_jao.shape[0]:,} rows × {unified_jao.shape[1]} columns
    - **Date Range**: {unified_jao['mtu'].min()} to {unified_jao['mtu'].max()}
    - **Timeline Sorted**: {unified_jao['mtu'].is_sorted()}
    - **Null Percentage**: {(unified_jao.null_count().sum_horizontal()[0] / (len(unified_jao) * len(unified_jao.columns)) * 100):.2f}%

    ### 2. Engineered Features
    - **Shape**: {features_jao.shape[0]:,} rows × {features_jao.shape[1]} columns
    - **Total Features**: {features_jao.shape[1] - 1} (excluding mtu timestamp)
    - **Null Percentage**: {(features_jao.null_count().sum_horizontal()[0] / (len(features_jao) * len(features_jao.columns)) * 100):.2f}%
      - _Note: High nulls expected due to sparse CNEC binding patterns and lag features_
    """)
    return


@app.cell
def _(mo):
    mo.md("""## 1. Unified JAO Dataset Structure""")
    return


@app.cell
def _(mo, unified_jao):
    # Show sample of unified data
    mo.md("""### Sample Data (First 20 Rows)""")
    mo.ui.table(unified_jao.head(20).to_pandas(), page_size=10)
    return


@app.cell
def _(mo, unified_jao):
    # Column breakdown
    maxbex_cols = [c for c in unified_jao.columns if 'border_' in c and not c.startswith('lta')]
    lta_cols = [c for c in unified_jao.columns if c.startswith('border_')]
    netpos_cols = [c for c in unified_jao.columns if c.startswith('netpos_')]

    mo.md(f"""
    ### Column Breakdown

    - **Timestamp**: 1 column (`mtu`)
    - **MaxBEX Borders**: {len(maxbex_cols)} columns
    - **LTA Borders**: {len(lta_cols)} columns
    - **Net Positions**: {len(netpos_cols)} columns (if present)
    - **Total**: {unified_jao.shape[1]} columns
    """)
    return


@app.cell
def _(mo):
    mo.md("""### Timeline Validation""")
    return


@app.cell
def _(alt, pl, unified_jao):
    # Timeline validation
    time_diffs = unified_jao['mtu'].diff().drop_nulls()

    # Most common time diff
    most_common = time_diffs.mode()[0]
    is_hourly = most_common.total_seconds() == 3600

    # Create histogram of time diffs
    time_diff_hours = time_diffs.map_elements(lambda x: x.total_seconds() / 3600, return_dtype=pl.Float64)

    time_diff_df = pl.DataFrame({
        'time_diff_hours': time_diff_hours
    })

    timeline_chart = alt.Chart(time_diff_df.to_pandas()).mark_bar().encode(
        x=alt.X('time_diff_hours:Q', bin=alt.Bin(maxbins=50), title='Time Difference (hours)'),
        y=alt.Y('count()', title='Count'),
        tooltip=['time_diff_hours:Q', 'count()']
    ).properties(
        title='Timeline Gaps Distribution',
        width=800,
        height=300
    )

    timeline_chart
    return is_hourly, most_common


@app.cell
def _(is_hourly, mo, most_common):
    if is_hourly:
        mo.md(f"""
        ✅ **Timeline Validation: PASS**
        - Most common time diff: {most_common} (1 hour)
        - Timeline is properly sorted and hourly
        """)
    else:
        mo.md(f"""
        ⚠️ **Timeline Validation: WARNING**
        - Most common time diff: {most_common}
        - Expected: 1 hour
        """)
    return


@app.cell
def _(mo):
    mo.md("""## 2. Feature Engineering Results""")
    return


@app.cell
def _(features_jao, mo, pl):
    # Feature category breakdown
    tier1_cols = [c for c in features_jao.columns if c.startswith('cnec_t1_')]
    tier2_cols = [c for c in features_jao.columns if c.startswith('cnec_t2_')]
    lta_feat_cols = [c for c in features_jao.columns if c.startswith('lta_')]
    temporal_cols = [c for c in features_jao.columns if c in ['hour', 'day', 'month', 'weekday', 'year', 'is_weekend', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'weekday_sin', 'weekday_cos']]
    target_cols = [c for c in features_jao.columns if c.startswith('target_')]

    # Create summary table
    feature_summary = pl.DataFrame({
        'Category': ['Tier-1 CNEC', 'Tier-2 CNEC', 'LTA', 'Temporal', 'Targets', 'TOTAL'],
        'Features': [len(tier1_cols), len(tier2_cols), len(lta_feat_cols), len(temporal_cols), len(target_cols), features_jao.shape[1] - 1],
        'Null %': [
            f"{(features_jao.select(tier1_cols).null_count().sum_horizontal()[0] / (len(features_jao) * len(tier1_cols)) * 100):.2f}%" if tier1_cols else "N/A",
            f"{(features_jao.select(tier2_cols).null_count().sum_horizontal()[0] / (len(features_jao) * len(tier2_cols)) * 100):.2f}%" if tier2_cols else "N/A",
            f"{(features_jao.select(lta_feat_cols).null_count().sum_horizontal()[0] / (len(features_jao) * len(lta_feat_cols)) * 100):.2f}%" if lta_feat_cols else "N/A",
            f"{(features_jao.select(temporal_cols).null_count().sum_horizontal()[0] / (len(features_jao) * len(temporal_cols)) * 100):.2f}%" if temporal_cols else "N/A",
            f"{(features_jao.select(target_cols).null_count().sum_horizontal()[0] / (len(features_jao) * len(target_cols)) * 100):.2f}%" if target_cols else "N/A",
            f"{(features_jao.null_count().sum_horizontal()[0] / (len(features_jao) * len(features_jao.columns)) * 100):.2f}%"
        ]
    })

    mo.ui.table(feature_summary.to_pandas())
    return lta_feat_cols, target_cols, temporal_cols, tier1_cols, tier2_cols


@app.cell
def _(mo):
    mo.md("""### Sample Features (First 20 Rows)""")
    return


@app.cell
def _(features_jao, mo):
    # Show first 10 columns only (too many to display all)
    mo.ui.table(features_jao.select(features_jao.columns[:10]).head(20).to_pandas(), page_size=10)
    return


@app.cell
def _(mo):
    mo.md("""## 3. LTA Features (Future Covariates)""")
    return


@app.cell
def _(lta_feat_cols, mo):
    # LTA features analysis
    mo.md(f"""
    **LTA Features**: {len(lta_feat_cols)} features

    LTA (Long Term Allocations) are **future covariates** - known years in advance via auctions.
    These should have **0% nulls** since they're available for the entire forecast horizon.
    """)
    return


@app.cell
def _(alt, features_jao):
    # Plot LTA total allocated over time
    lta_chart_data = features_jao.select(['mtu', 'lta_total_allocated']).sort('mtu')

    lta_chart = alt.Chart(lta_chart_data.to_pandas()).mark_line().encode(
        x=alt.X('mtu:T', title='Date'),
        y=alt.Y('lta_total_allocated:Q', title='Total LTA Allocated (MW)'),
        tooltip=['mtu:T', 'lta_total_allocated:Q']
    ).properties(
        title='LTA Total Allocated Capacity Over Time',
        width=800,
        height=400
    ).interactive()

    lta_chart
    return


@app.cell
def _(features_jao, lta_feat_cols, mo):
    # LTA statistics
    lta_stats = features_jao.select(lta_feat_cols[:5]).describe()

    mo.md("""### LTA Sample Statistics (First 5 Features)""")
    mo.ui.table(lta_stats.to_pandas())
    return


@app.cell
def _(mo):
    mo.md("""## 4. Temporal Features""")
    return


@app.cell
def _(features_jao, mo, temporal_cols):
    # Show temporal features
    mo.md(f"""
    **Temporal Features**: {len(temporal_cols)} features

    Cyclic encoding for hour, month, and weekday to capture periodicity.
    """)

    mo.ui.table(features_jao.select(['mtu'] + temporal_cols).head(24).to_pandas())
    return


@app.cell
def _(alt, features_jao, pl):
    # Hourly distribution
    hour_dist = features_jao.group_by('hour').agg(pl.len().alias('count')).sort('hour')

    hour_chart = alt.Chart(hour_dist.to_pandas()).mark_bar().encode(
        x=alt.X('hour:O', title='Hour of Day'),
        y=alt.Y('count:Q', title='Count'),
        tooltip=['hour:O', 'count:Q']
    ).properties(
        title='Distribution by Hour of Day',
        width=800,
        height=300
    )

    hour_chart
    return


@app.cell
def _(mo):
    mo.md("""## 5. CNEC Features (Historical)""")
    return


@app.cell
def _(features_jao, mo, tier1_cols, tier2_cols):
    # CNEC features overview
    mo.md(f"""
    **CNEC Features**: {len(tier1_cols) + len(tier2_cols)} total

    - **Tier-1 CNECs**: {len(tier1_cols)} features (top 58 most critical CNECs)
    - **Tier-2 CNECs**: {len(tier2_cols)} features (next 150 CNECs)

    High null percentage is **expected** due to:
    1. Sparse binding patterns (not all CNECs bind every hour)
    2. Lag features create nulls at timeline start
    3. Pivoting creates sparse constraint matrices
    """)

    # Sample Tier-1 features
    mo.ui.table(features_jao.select(['mtu'] + tier1_cols[:5]).head(20).to_pandas(), page_size=10)
    return


@app.cell
def _(alt, features_jao, pl, tier1_cols):
    # Binding frequency for sample Tier-1 CNECs
    binding_cols = [c for c in tier1_cols if 'binding_' in c][:10]

    if binding_cols:
        binding_freq = pl.DataFrame({
            'cnec': [c.replace('cnec_t1_binding_', '') for c in binding_cols],
            'binding_rate': [features_jao[c].mean() for c in binding_cols]
        })

        binding_chart = alt.Chart(binding_freq.to_pandas()).mark_bar().encode(
            x=alt.X('binding_rate:Q', title='Binding Frequency (0-1)'),
            y=alt.Y('cnec:N', sort='-x', title='CNEC'),
            tooltip=['cnec:N', alt.Tooltip('binding_rate:Q', format='.2%')]
        ).properties(
            title='Binding Frequency - Sample Tier-1 CNECs',
            width=800,
            height=300
        )

        binding_chart
    else:
        None
    return


@app.cell
def _(mo):
    mo.md("""## 6. Target Variables""")
    return


@app.cell
def _(features_jao, mo, target_cols):
    # Show target variables (MaxBEX borders)
    mo.md(f"""
    **Target Variables**: {len(target_cols)} features

    Sample MaxBEX borders for forecasting (first 10 borders):
    """)

    if target_cols:
        mo.ui.table(features_jao.select(['mtu'] + target_cols).head(20).to_pandas(), page_size=10)
    return


@app.cell
def _(alt, features_jao, target_cols):
    # Plot sample target variable over time
    if target_cols:
        sample_target = target_cols[0]

        target_chart_data = features_jao.select(['mtu', sample_target]).sort('mtu')

        target_chart = alt.Chart(target_chart_data.to_pandas()).mark_line().encode(
            x=alt.X('mtu:T', title='Date'),
            y=alt.Y(f'{sample_target}:Q', title='Capacity (MW)'),
            tooltip=['mtu:T', f'{sample_target}:Q']
        ).properties(
            title=f'Target Variable Over Time: {sample_target}',
            width=800,
            height=400
        ).interactive()

        target_chart
    else:
        None
    return


@app.cell
def _(mo):
    mo.md(
        """
    ## 7. Data Quality Summary

    Final validation checks:
    """
    )
    return


@app.cell
def _(features_jao, is_hourly, lta_feat_cols, mo, pl, unified_jao):
    # Data quality checks
    checks = []

    # Check 1: Timeline sorted and hourly
    checks.append({
        'Check': 'Timeline sorted & hourly',
        'Status': 'PASS' if is_hourly else 'FAIL',
        'Details': f'Most common diff: {unified_jao["mtu"].diff().drop_nulls().mode()[0]}'
    })

    # Check 2: No nulls in unified dataset
    unified_nulls = unified_jao.null_count().sum_horizontal()[0]
    checks.append({
        'Check': 'Unified data completeness',
        'Status': 'PASS' if unified_nulls == 0 else 'WARNING',
        'Details': f'{unified_nulls} nulls ({(unified_nulls / (len(unified_jao) * len(unified_jao.columns)) * 100):.2f}%)'
    })

    # Check 3: LTA features have no nulls (future covariates)
    lta_nulls = features_jao.select(lta_feat_cols).null_count().sum_horizontal()[0] if lta_feat_cols else 0
    checks.append({
        'Check': 'LTA future covariates complete',
        'Status': 'PASS' if lta_nulls == 0 else 'FAIL',
        'Details': f'{lta_nulls} nulls in {len(lta_feat_cols)} LTA features'
    })

    # Check 4: Data consistency (same row count)
    checks.append({
        'Check': 'Data consistency',
        'Status': 'PASS' if len(unified_jao) == len(features_jao) else 'FAIL',
        'Details': f'Unified: {len(unified_jao):,} rows, Features: {len(features_jao):,} rows'
    })

    checks_df = pl.DataFrame(checks)

    mo.ui.table(checks_df.to_pandas())
    return (checks,)


@app.cell
def _(checks, mo):
    # Overall status
    all_pass = all(c['Status'] == 'PASS' for c in checks)

    if all_pass:
        mo.md("""
        ✅ **All validation checks PASSED**

        Data is ready for model training and inference!
        """)
    else:
        failed = [c['Check'] for c in checks if c['Status'] == 'FAIL']
        warnings = [c['Check'] for c in checks if c['Status'] == 'WARNING']

        status = "⚠️ **Some checks failed or have warnings**\n\n"
        if failed:
            status += f"**Failed**: {', '.join(failed)}\n\n"
        if warnings:
            status += f"**Warnings**: {', '.join(warnings)}"

        mo.md(status)
    return


@app.cell
def _(mo):
    mo.md(
        """
    ## Next Steps

    ✅ **JAO Data Collection & Unification: COMPLETE**
    - 24 months of data (Oct 2023 - Oct 2025)
    - 17,544 hourly records
    - 726 features engineered

    **Remaining Work:**
    1. Collect weather data (OpenMeteo, 52 grid points)
    2. Collect ENTSO-E data (generation, flows, outages)
    3. Complete remaining feature scaffolding (NetPos lags, MaxBEX lags, system aggregates)
    4. Integrate all data sources
    5. Begin zero-shot Chronos 2 inference

    ---

    **Data Files**:
    - `data/processed/unified_jao_24month.parquet` (5.59 MB)
    - `data/processed/cnec_hourly_24month.parquet` (4.57 MB)
    - `data/processed/features_jao_24month.parquet` (0.60 MB)
    """
    )
    return


@app.cell
def _(mo, unified_jao):
    # Display the unified JAO dataset
    mo.md("## Unified JAO Dataset")
    mo.ui.table(unified_jao.to_pandas(), page_size=20)
    return


@app.cell
def _(features_jao, mo, unified_jao):
    # Show the actual structure with timestamp
    mo.md("### Unified JAO Dataset Structure")
    display_df = unified_jao.select(['mtu'] + [c for c in unified_jao.columns if c != 'mtu'][:10]).head(10)
    mo.ui.table(display_df.to_pandas())

    mo.md(f"""
    **Dataset Info:**
    - **Total columns**: {len(unified_jao.columns)}
    - **Timestamp column**: `mtu` (Market Time Unit)
    - **Date range**: {unified_jao['mtu'].min()} to {unified_jao['mtu'].max()}
    """)

    # Show the 726 features dataset separately
    mo.md("### Features Dataset (726 engineered features)")
    mo.ui.table(features_jao.select(['mtu'] + features_jao.columns[1:11]).head(10).to_pandas())
    return


@app.cell
def _(features_jao, mo, pl, unified_jao):
    # Show actual column counts
    mo.md(f"""
    ### Dataset Column Counts

    **unified_jao**: {len(unified_jao.columns)} columns
    - Raw unified data (MaxBEX, LTA, NetPos)

    **features_jao**: {len(features_jao.columns)} columns  
    - Engineered features (726 + timestamp)
    """)

    # Show all column categories in features dataset
    tier1_cols = [c for c in features_jao.columns if c.startswith('cnec_t1_')]
    tier2_cols = [c for c in features_jao.columns if c.startswith('cnec_t2_')]
    lta_feat_cols = [c for c in features_jao.columns if c.startswith('lta_')]
    temporal_cols = [c for c in features_jao.columns if c in ['hour', 'day', 'month', 'weekday', 'year', 'is_weekend', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'weekday_sin', 'weekday_cos']]
    target_cols = [c for c in features_jao.columns if c.startswith('target_')]

    feature_breakdown = pl.DataFrame({
        'Category': ['Tier-1 CNEC', 'Tier-2 CNEC', 'LTA', 'Temporal', 'Targets', 'TOTAL'],
        'Count': [len(tier1_cols), len(tier2_cols), len(lta_feat_cols), len(temporal_cols), len(target_cols), len(features_jao.columns)]
    })

    mo.md("### Feature Breakdown in features_jao dataset:")
    mo.ui.table(feature_breakdown.to_pandas())

    # Show first 20 actual column names from features_jao
    mo.md("### First 20 column names in features_jao:")
    for i, col in enumerate(features_jao.columns[:]):
        print(f"{i+1:3d}. {col}")
    return lta_feat_cols, target_cols, temporal_cols, tier1_cols, tier2_cols


@app.cell
def _(features_jao, mo, pl):
    # Check CNEC Tier-1 binding values without redefining variables
    _cnec_t1_binding_cols = [c for c in features_jao.columns if c.startswith('target_border')]

    if _cnec_t1_binding_cols:
        # Show sample of binding values
        _sample_bindings = features_jao.select(['mtu'] + _cnec_t1_binding_cols[:5]).head(20)
    
        mo.md("### Sample CNEC Tier-1 Binding Values (First 5 CNECs)")
        mo.ui.table(_sample_bindings.to_pandas(), page_size=10)
    
        # Check unique values in first binding column
        _first_col = _cnec_t1_binding_cols[0]
        _unique_vals = features_jao[_first_col].unique().sort()
    
        mo.md(f"### Unique Values in {_first_col}")
        print(f"Unique values: {_unique_vals.to_list()}")
    
        # Value counts for first column
        _val_counts = features_jao.group_by(_first_col).agg(pl.len().alias('count')).sort('count', descending=True)
        mo.ui.table(_val_counts.to_pandas())
    return


if __name__ == "__main__":
    app.run()