File size: 19,463 Bytes
27cb60a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
"""FBMC Flow Forecasting - Unified JAO Data Exploration

Objective: Explore unified 24-month JAO data and engineered features

This notebook explores:
1. Unified JAO dataset (MaxBEX + CNEC + LTA + NetPos)
2. Engineered features (726 features across 5 categories)
3. Feature completeness and validation
4. Key statistics and distributions

Usage:
    marimo edit notebooks/02_unified_jao_exploration.py
"""

import marimo

__generated_with = "0.17.2"
app = marimo.App(width="medium")


@app.cell
def _():
    import marimo as mo
    import polars as pl
    import altair as alt
    from pathlib import Path
    import numpy as np
    return Path, alt, mo, pl


@app.cell
def _(mo):
    mo.md(
        r"""
    # Unified JAO Data Exploration (24 Months)

    **Date Range**: October 2023 - October 2025 (24 months)

    ## Data Pipeline Overview:

    1. **Raw JAO Data** (4 datasets)
       - MaxBEX: Maximum Bilateral Exchange capacity (TARGET)
       - CNEC/PTDF: Critical constraints with power transfer factors
       - LTA: Long Term Allocations (future covariates)
       - Net Positions: Domain boundaries (min/max per zone)

    2. **Data Unification** → `unified_jao_24month.parquet`
       - Deduplicated NetPos (removed 1,152 duplicate timestamps)
       - Forward-filled LTA gaps (710 missing hours)
       - Broadcast daily CNEC to hourly
       - Sorted timeline (hourly, 17,544 records)

    3. **Feature Engineering** → `features_jao_24month.parquet`
       - 726 features across 5 categories
       - Tier-1 CNEC: 274 features
       - Tier-2 CNEC: 390 features
       - LTA: 40 features
       - Temporal: 12 features
       - Targets: 10 features
    """
    )
    return


@app.cell
def _(Path, pl):
    # Load unified datasets
    print("Loading unified JAO datasets...")

    processed_dir = Path('data/processed')

    unified_jao = pl.read_parquet(processed_dir / 'unified_jao_24month.parquet')
    cnec_hourly = pl.read_parquet(processed_dir / 'cnec_hourly_24month.parquet')
    features_jao = pl.read_parquet(processed_dir / 'features_jao_24month.parquet')

    print(f"[OK] Unified JAO: {unified_jao.shape}")
    print(f"[OK] CNEC hourly: {cnec_hourly.shape}")
    print(f"[OK] Features: {features_jao.shape}")
    return features_jao, unified_jao


@app.cell
def _(features_jao, mo, unified_jao):
    # Dataset overview
    mo.md(f"""
    ## Dataset Overview

    ### 1. Unified JAO Dataset
    - **Shape**: {unified_jao.shape[0]:,} rows × {unified_jao.shape[1]} columns
    - **Date Range**: {unified_jao['mtu'].min()} to {unified_jao['mtu'].max()}
    - **Timeline Sorted**: {unified_jao['mtu'].is_sorted()}
    - **Null Percentage**: {(unified_jao.null_count().sum_horizontal()[0] / (len(unified_jao) * len(unified_jao.columns)) * 100):.2f}%

    ### 2. Engineered Features
    - **Shape**: {features_jao.shape[0]:,} rows × {features_jao.shape[1]} columns
    - **Total Features**: {features_jao.shape[1] - 1} (excluding mtu timestamp)
    - **Null Percentage**: {(features_jao.null_count().sum_horizontal()[0] / (len(features_jao) * len(features_jao.columns)) * 100):.2f}%
      - _Note: High nulls expected due to sparse CNEC binding patterns and lag features_
    """)
    return


@app.cell
def _(mo):
    mo.md("""## 1. Unified JAO Dataset Structure""")
    return


@app.cell
def _(mo, unified_jao):
    # Show sample of unified data
    mo.md("""### Sample Data (First 20 Rows)""")
    mo.ui.table(unified_jao.head(20).to_pandas(), page_size=10)
    return


@app.cell
def _(mo, unified_jao):
    # Column breakdown
    maxbex_cols = [c for c in unified_jao.columns if 'border_' in c and not c.startswith('lta')]
    lta_cols = [c for c in unified_jao.columns if c.startswith('border_')]
    netpos_cols = [c for c in unified_jao.columns if c.startswith('netpos_')]

    mo.md(f"""
    ### Column Breakdown

    - **Timestamp**: 1 column (`mtu`)
    - **MaxBEX Borders**: {len(maxbex_cols)} columns
    - **LTA Borders**: {len(lta_cols)} columns
    - **Net Positions**: {len(netpos_cols)} columns (if present)
    - **Total**: {unified_jao.shape[1]} columns
    """)
    return


@app.cell
def _(mo):
    mo.md("""### Timeline Validation""")
    return


@app.cell
def _(alt, pl, unified_jao):
    # Timeline validation
    time_diffs = unified_jao['mtu'].diff().drop_nulls()

    # Most common time diff
    most_common = time_diffs.mode()[0]
    is_hourly = most_common.total_seconds() == 3600

    # Create histogram of time diffs
    time_diff_hours = time_diffs.map_elements(lambda x: x.total_seconds() / 3600, return_dtype=pl.Float64)

    time_diff_df = pl.DataFrame({
        'time_diff_hours': time_diff_hours
    })

    timeline_chart = alt.Chart(time_diff_df.to_pandas()).mark_bar().encode(
        x=alt.X('time_diff_hours:Q', bin=alt.Bin(maxbins=50), title='Time Difference (hours)'),
        y=alt.Y('count()', title='Count'),
        tooltip=['time_diff_hours:Q', 'count()']
    ).properties(
        title='Timeline Gaps Distribution',
        width=800,
        height=300
    )

    timeline_chart
    return is_hourly, most_common


@app.cell
def _(is_hourly, mo, most_common):
    if is_hourly:
        mo.md(f"""
        ✅ **Timeline Validation: PASS**
        - Most common time diff: {most_common} (1 hour)
        - Timeline is properly sorted and hourly
        """)
    else:
        mo.md(f"""
        ⚠️ **Timeline Validation: WARNING**
        - Most common time diff: {most_common}
        - Expected: 1 hour
        """)
    return


@app.cell
def _(mo):
    mo.md("""## 2. Feature Engineering Results""")
    return


@app.cell
def _(features_jao, mo, pl):
    # Feature category breakdown
    tier1_cols = [c for c in features_jao.columns if c.startswith('cnec_t1_')]
    tier2_cols = [c for c in features_jao.columns if c.startswith('cnec_t2_')]
    lta_feat_cols = [c for c in features_jao.columns if c.startswith('lta_')]
    temporal_cols = [c for c in features_jao.columns if c in ['hour', 'day', 'month', 'weekday', 'year', 'is_weekend', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'weekday_sin', 'weekday_cos']]
    target_cols = [c for c in features_jao.columns if c.startswith('target_')]

    # Create summary table
    feature_summary = pl.DataFrame({
        'Category': ['Tier-1 CNEC', 'Tier-2 CNEC', 'LTA', 'Temporal', 'Targets', 'TOTAL'],
        'Features': [len(tier1_cols), len(tier2_cols), len(lta_feat_cols), len(temporal_cols), len(target_cols), features_jao.shape[1] - 1],
        'Null %': [
            f"{(features_jao.select(tier1_cols).null_count().sum_horizontal()[0] / (len(features_jao) * len(tier1_cols)) * 100):.2f}%" if tier1_cols else "N/A",
            f"{(features_jao.select(tier2_cols).null_count().sum_horizontal()[0] / (len(features_jao) * len(tier2_cols)) * 100):.2f}%" if tier2_cols else "N/A",
            f"{(features_jao.select(lta_feat_cols).null_count().sum_horizontal()[0] / (len(features_jao) * len(lta_feat_cols)) * 100):.2f}%" if lta_feat_cols else "N/A",
            f"{(features_jao.select(temporal_cols).null_count().sum_horizontal()[0] / (len(features_jao) * len(temporal_cols)) * 100):.2f}%" if temporal_cols else "N/A",
            f"{(features_jao.select(target_cols).null_count().sum_horizontal()[0] / (len(features_jao) * len(target_cols)) * 100):.2f}%" if target_cols else "N/A",
            f"{(features_jao.null_count().sum_horizontal()[0] / (len(features_jao) * len(features_jao.columns)) * 100):.2f}%"
        ]
    })

    mo.ui.table(feature_summary.to_pandas())
    return lta_feat_cols, target_cols, temporal_cols, tier1_cols, tier2_cols


@app.cell
def _(mo):
    mo.md("""### Sample Features (First 20 Rows)""")
    return


@app.cell
def _(features_jao, mo):
    # Show first 10 columns only (too many to display all)
    mo.ui.table(features_jao.select(features_jao.columns[:10]).head(20).to_pandas(), page_size=10)
    return


@app.cell
def _(mo):
    mo.md("""## 3. LTA Features (Future Covariates)""")
    return


@app.cell
def _(lta_feat_cols, mo):
    # LTA features analysis
    mo.md(f"""
    **LTA Features**: {len(lta_feat_cols)} features

    LTA (Long Term Allocations) are **future covariates** - known years in advance via auctions.
    These should have **0% nulls** since they're available for the entire forecast horizon.
    """)
    return


@app.cell
def _(alt, features_jao):
    # Plot LTA total allocated over time
    lta_chart_data = features_jao.select(['mtu', 'lta_total_allocated']).sort('mtu')

    lta_chart = alt.Chart(lta_chart_data.to_pandas()).mark_line().encode(
        x=alt.X('mtu:T', title='Date'),
        y=alt.Y('lta_total_allocated:Q', title='Total LTA Allocated (MW)'),
        tooltip=['mtu:T', 'lta_total_allocated:Q']
    ).properties(
        title='LTA Total Allocated Capacity Over Time',
        width=800,
        height=400
    ).interactive()

    lta_chart
    return


@app.cell
def _(features_jao, lta_feat_cols, mo):
    # LTA statistics
    lta_stats = features_jao.select(lta_feat_cols[:5]).describe()

    mo.md("""### LTA Sample Statistics (First 5 Features)""")
    mo.ui.table(lta_stats.to_pandas())
    return


@app.cell
def _(mo):
    mo.md("""## 4. Temporal Features""")
    return


@app.cell
def _(features_jao, mo, temporal_cols):
    # Show temporal features
    mo.md(f"""
    **Temporal Features**: {len(temporal_cols)} features

    Cyclic encoding for hour, month, and weekday to capture periodicity.
    """)

    mo.ui.table(features_jao.select(['mtu'] + temporal_cols).head(24).to_pandas())
    return


@app.cell
def _(alt, features_jao, pl):
    # Hourly distribution
    hour_dist = features_jao.group_by('hour').agg(pl.len().alias('count')).sort('hour')

    hour_chart = alt.Chart(hour_dist.to_pandas()).mark_bar().encode(
        x=alt.X('hour:O', title='Hour of Day'),
        y=alt.Y('count:Q', title='Count'),
        tooltip=['hour:O', 'count:Q']
    ).properties(
        title='Distribution by Hour of Day',
        width=800,
        height=300
    )

    hour_chart
    return


@app.cell
def _(mo):
    mo.md("""## 5. CNEC Features (Historical)""")
    return


@app.cell
def _(features_jao, mo, tier1_cols, tier2_cols):
    # CNEC features overview
    mo.md(f"""
    **CNEC Features**: {len(tier1_cols) + len(tier2_cols)} total

    - **Tier-1 CNECs**: {len(tier1_cols)} features (top 58 most critical CNECs)
    - **Tier-2 CNECs**: {len(tier2_cols)} features (next 150 CNECs)

    High null percentage is **expected** due to:
    1. Sparse binding patterns (not all CNECs bind every hour)
    2. Lag features create nulls at timeline start
    3. Pivoting creates sparse constraint matrices
    """)

    # Sample Tier-1 features
    mo.ui.table(features_jao.select(['mtu'] + tier1_cols[:5]).head(20).to_pandas(), page_size=10)
    return


@app.cell
def _(alt, features_jao, pl, tier1_cols):
    # Binding frequency for sample Tier-1 CNECs
    binding_cols = [c for c in tier1_cols if 'binding_' in c][:10]

    if binding_cols:
        binding_freq = pl.DataFrame({
            'cnec': [c.replace('cnec_t1_binding_', '') for c in binding_cols],
            'binding_rate': [features_jao[c].mean() for c in binding_cols]
        })

        binding_chart = alt.Chart(binding_freq.to_pandas()).mark_bar().encode(
            x=alt.X('binding_rate:Q', title='Binding Frequency (0-1)'),
            y=alt.Y('cnec:N', sort='-x', title='CNEC'),
            tooltip=['cnec:N', alt.Tooltip('binding_rate:Q', format='.2%')]
        ).properties(
            title='Binding Frequency - Sample Tier-1 CNECs',
            width=800,
            height=300
        )

        binding_chart
    else:
        None
    return


@app.cell
def _(mo):
    mo.md("""## 6. Target Variables""")
    return


@app.cell
def _(features_jao, mo, target_cols):
    # Show target variables (MaxBEX borders)
    mo.md(f"""
    **Target Variables**: {len(target_cols)} features

    Sample MaxBEX borders for forecasting (first 10 borders):
    """)

    if target_cols:
        mo.ui.table(features_jao.select(['mtu'] + target_cols).head(20).to_pandas(), page_size=10)
    return


@app.cell
def _(alt, features_jao, target_cols):
    # Plot sample target variable over time
    if target_cols:
        sample_target = target_cols[0]

        target_chart_data = features_jao.select(['mtu', sample_target]).sort('mtu')

        target_chart = alt.Chart(target_chart_data.to_pandas()).mark_line().encode(
            x=alt.X('mtu:T', title='Date'),
            y=alt.Y(f'{sample_target}:Q', title='Capacity (MW)'),
            tooltip=['mtu:T', f'{sample_target}:Q']
        ).properties(
            title=f'Target Variable Over Time: {sample_target}',
            width=800,
            height=400
        ).interactive()

        target_chart
    else:
        None
    return


@app.cell
def _(mo):
    mo.md(
        """
    ## 7. Data Quality Summary

    Final validation checks:
    """
    )
    return


@app.cell
def _(features_jao, is_hourly, lta_feat_cols, mo, pl, unified_jao):
    # Data quality checks
    checks = []

    # Check 1: Timeline sorted and hourly
    checks.append({
        'Check': 'Timeline sorted & hourly',
        'Status': 'PASS' if is_hourly else 'FAIL',
        'Details': f'Most common diff: {unified_jao["mtu"].diff().drop_nulls().mode()[0]}'
    })

    # Check 2: No nulls in unified dataset
    unified_nulls = unified_jao.null_count().sum_horizontal()[0]
    checks.append({
        'Check': 'Unified data completeness',
        'Status': 'PASS' if unified_nulls == 0 else 'WARNING',
        'Details': f'{unified_nulls} nulls ({(unified_nulls / (len(unified_jao) * len(unified_jao.columns)) * 100):.2f}%)'
    })

    # Check 3: LTA features have no nulls (future covariates)
    lta_nulls = features_jao.select(lta_feat_cols).null_count().sum_horizontal()[0] if lta_feat_cols else 0
    checks.append({
        'Check': 'LTA future covariates complete',
        'Status': 'PASS' if lta_nulls == 0 else 'FAIL',
        'Details': f'{lta_nulls} nulls in {len(lta_feat_cols)} LTA features'
    })

    # Check 4: Data consistency (same row count)
    checks.append({
        'Check': 'Data consistency',
        'Status': 'PASS' if len(unified_jao) == len(features_jao) else 'FAIL',
        'Details': f'Unified: {len(unified_jao):,} rows, Features: {len(features_jao):,} rows'
    })

    checks_df = pl.DataFrame(checks)

    mo.ui.table(checks_df.to_pandas())
    return (checks,)


@app.cell
def _(checks, mo):
    # Overall status
    all_pass = all(c['Status'] == 'PASS' for c in checks)

    if all_pass:
        mo.md("""
        ✅ **All validation checks PASSED**

        Data is ready for model training and inference!
        """)
    else:
        failed = [c['Check'] for c in checks if c['Status'] == 'FAIL']
        warnings = [c['Check'] for c in checks if c['Status'] == 'WARNING']

        status = "⚠️ **Some checks failed or have warnings**\n\n"
        if failed:
            status += f"**Failed**: {', '.join(failed)}\n\n"
        if warnings:
            status += f"**Warnings**: {', '.join(warnings)}"

        mo.md(status)
    return


@app.cell
def _(mo):
    mo.md(
        """
    ## Next Steps

    ✅ **JAO Data Collection & Unification: COMPLETE**
    - 24 months of data (Oct 2023 - Oct 2025)
    - 17,544 hourly records
    - 726 features engineered

    **Remaining Work:**
    1. Collect weather data (OpenMeteo, 52 grid points)
    2. Collect ENTSO-E data (generation, flows, outages)
    3. Complete remaining feature scaffolding (NetPos lags, MaxBEX lags, system aggregates)
    4. Integrate all data sources
    5. Begin zero-shot Chronos 2 inference

    ---

    **Data Files**:
    - `data/processed/unified_jao_24month.parquet` (5.59 MB)
    - `data/processed/cnec_hourly_24month.parquet` (4.57 MB)
    - `data/processed/features_jao_24month.parquet` (0.60 MB)
    """
    )
    return


@app.cell
def _(mo, unified_jao):
    # Display the unified JAO dataset
    mo.md("## Unified JAO Dataset")
    mo.ui.table(unified_jao.to_pandas(), page_size=20)
    return


@app.cell
def _(features_jao, mo, unified_jao):
    # Show the actual structure with timestamp
    mo.md("### Unified JAO Dataset Structure")
    display_df = unified_jao.select(['mtu'] + [c for c in unified_jao.columns if c != 'mtu'][:10]).head(10)
    mo.ui.table(display_df.to_pandas())

    mo.md(f"""
    **Dataset Info:**
    - **Total columns**: {len(unified_jao.columns)}
    - **Timestamp column**: `mtu` (Market Time Unit)
    - **Date range**: {unified_jao['mtu'].min()} to {unified_jao['mtu'].max()}
    """)

    # Show the 726 features dataset separately
    mo.md("### Features Dataset (726 engineered features)")
    mo.ui.table(features_jao.select(['mtu'] + features_jao.columns[1:11]).head(10).to_pandas())
    return


@app.cell
def _(features_jao, mo, pl, unified_jao):
    # Show actual column counts
    mo.md(f"""
    ### Dataset Column Counts

    **unified_jao**: {len(unified_jao.columns)} columns
    - Raw unified data (MaxBEX, LTA, NetPos)

    **features_jao**: {len(features_jao.columns)} columns  
    - Engineered features (726 + timestamp)
    """)

    # Show all column categories in features dataset
    tier1_cols = [c for c in features_jao.columns if c.startswith('cnec_t1_')]
    tier2_cols = [c for c in features_jao.columns if c.startswith('cnec_t2_')]
    lta_feat_cols = [c for c in features_jao.columns if c.startswith('lta_')]
    temporal_cols = [c for c in features_jao.columns if c in ['hour', 'day', 'month', 'weekday', 'year', 'is_weekend', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'weekday_sin', 'weekday_cos']]
    target_cols = [c for c in features_jao.columns if c.startswith('target_')]

    feature_breakdown = pl.DataFrame({
        'Category': ['Tier-1 CNEC', 'Tier-2 CNEC', 'LTA', 'Temporal', 'Targets', 'TOTAL'],
        'Count': [len(tier1_cols), len(tier2_cols), len(lta_feat_cols), len(temporal_cols), len(target_cols), len(features_jao.columns)]
    })

    mo.md("### Feature Breakdown in features_jao dataset:")
    mo.ui.table(feature_breakdown.to_pandas())

    # Show first 20 actual column names from features_jao
    mo.md("### First 20 column names in features_jao:")
    for i, col in enumerate(features_jao.columns[:]):
        print(f"{i+1:3d}. {col}")
    return lta_feat_cols, target_cols, temporal_cols, tier1_cols, tier2_cols


@app.cell
def _(features_jao, mo, pl):
    # Check CNEC Tier-1 binding values without redefining variables
    _cnec_t1_binding_cols = [c for c in features_jao.columns if c.startswith('target_border')]

    if _cnec_t1_binding_cols:
        # Show sample of binding values
        _sample_bindings = features_jao.select(['mtu'] + _cnec_t1_binding_cols[:5]).head(20)
    
        mo.md("### Sample CNEC Tier-1 Binding Values (First 5 CNECs)")
        mo.ui.table(_sample_bindings.to_pandas(), page_size=10)
    
        # Check unique values in first binding column
        _first_col = _cnec_t1_binding_cols[0]
        _unique_vals = features_jao[_first_col].unique().sort()
    
        mo.md(f"### Unique Values in {_first_col}")
        print(f"Unique values: {_unique_vals.to_list()}")
    
        # Value counts for first column
        _val_counts = features_jao.group_by(_first_col).agg(pl.len().alias('count')).sort('count', descending=True)
        mo.ui.table(_val_counts.to_pandas())
    return


if __name__ == "__main__":
    app.run()