# fbmc-chronos2/scripts/validate_jao_update.py
# Evgueni Poloukarov
# feat: complete Phase 1 ENTSO-E asset-specific outage validation
# 27cb60a
"""Validate updated JAO data collection results.
Compares old vs new column selection and validates transformations.
"""
import sys
from pathlib import Path
import polars as pl
# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))
def _fmt_stat(value, spec=".4f"):
    """Format a column statistic, or return ``"n/a"`` when it is undefined.

    Polars aggregations (``min``/``max``/``mean``/``std``/``quantile``) yield
    ``None`` for empty or all-null columns; applying a float format spec to
    ``None`` would raise ``TypeError`` and abort the whole report.
    """
    return "n/a" if value is None else format(value, spec)


def main():
    """Validate updated JAO collection.

    Loads the updated CNEC, MaxBEX and LTA sample parquet files, optionally
    loads the original CNEC sample for comparison, and prints a report with
    seven sections: column counts, new columns, removed columns, shadow-price
    log transform, RAM/PTDF range checks, LTA content and file sizes.

    The function only prints; it returns ``None`` and does not set an exit
    code. Any error raised while reading one of the *updated* sample files
    propagates to the caller. A missing or unreadable *original* sample is
    tolerated and simply disables the comparison parts of the report.
    """
    updated_cnec_path = "data/raw/sample_updated/jao_cnec_sample.parquet"
    updated_maxbex_path = "data/raw/sample_updated/jao_maxbex_sample.parquet"
    updated_lta_path = "data/raw/sample_updated/jao_lta_sample.parquet"
    original_cnec_path = "data/raw/sample/jao_cnec_sample.parquet"

    banner = "=" * 80
    rule = "-" * 80

    print("\n" + banner)
    print("JAO COLLECTION UPDATE VALIDATION")
    print(banner)

    # Load updated data
    updated_cnec = pl.read_parquet(updated_cnec_path)
    # MaxBEX content is not inspected below; reading it still confirms the
    # file exists and is a loadable parquet file.
    pl.read_parquet(updated_maxbex_path)
    updated_lta = pl.read_parquet(updated_lta_path)

    # Load original data (if exists). The comparison is best-effort, so a
    # failure here must not abort the run, but it should not be silent
    # either, and Ctrl-C / SystemExit must still propagate.
    try:
        original_cnec = pl.read_parquet(original_cnec_path)
    except FileNotFoundError:
        original_cnec = None
    except Exception as exc:
        print(f"[WARN] Could not read original CNEC sample: {exc}")
        original_cnec = None
    has_original = original_cnec is not None

    print("\n## 1. COLUMN COUNT COMPARISON")
    print(rule)
    updated_cols = updated_cnec.shape[1]
    if has_original:
        original_cols = original_cnec.shape[1]
        removed = original_cols - updated_cols
        print(f"Original CNEC columns: {original_cols}")
        print(f"Updated CNEC columns: {updated_cols}")
        print(f"Reduction: {removed} columns removed")
        if original_cols:  # avoid dividing by zero on a column-less frame
            print(f"Reduction %: {100 * removed / original_cols:.1f}%")
    else:
        print(f"Updated CNEC columns: {updated_cols}")
        print("(Original data not available for comparison)")

    print("\n## 2. NEW COLUMNS VALIDATION")
    print(rule)
    for col in ('fuaf', 'frm', 'shadow_price_log'):
        if col not in updated_cnec.columns:
            print(f"[FAIL] {col}: MISSING")
            continue
        print(f"[OK] {col}: PRESENT")
        # Stats
        col_data = updated_cnec[col]
        n_records = len(col_data)
        null_count = col_data.null_count()
        null_pct = 100 * null_count / n_records if n_records else 0.0
        print(f" - Records: {n_records}")
        print(f" - Nulls: {null_count} ({null_pct:.1f}%)")
        print(f" - Min: {_fmt_stat(col_data.min())}")
        print(f" - Max: {_fmt_stat(col_data.max())}")
        print(f" - Mean: {_fmt_stat(col_data.mean())}")

    print("\n## 3. REMOVED COLUMNS VALIDATION")
    print(rule)
    all_removed = True
    for col in ('hubFrom', 'hubTo', 'f0all', 'amr', 'lta_margin'):
        if col in updated_cnec.columns:
            print(f"[FAIL] {col}: STILL PRESENT (should be removed)")
            all_removed = False
        else:
            print(f"[OK] {col}: Removed")
    if all_removed:
        print("\n[OK] All expected columns successfully removed")

    print("\n## 4. SHADOW PRICE LOG TRANSFORM VALIDATION")
    print(rule)
    if 'shadow_price' in updated_cnec.columns and 'shadow_price_log' in updated_cnec.columns:
        sp = updated_cnec['shadow_price']
        sp_log = updated_cnec['shadow_price_log']
        print("Shadow price (original):")
        print(f" - Range: [{_fmt_stat(sp.min(), '.2f')}, {_fmt_stat(sp.max(), '.2f')}] EUR/MW")
        print(f" - 99th percentile: {_fmt_stat(sp.quantile(0.99), '.2f')} EUR/MW")
        print(f" - Values >1000: {(sp > 1000).sum()} (should be uncapped)")
        print("\nShadow price (log-transformed):")
        print(f" - Range: [{_fmt_stat(sp_log.min())}, {_fmt_stat(sp_log.max())}]")
        print(f" - Mean: {_fmt_stat(sp_log.mean())}")
        print(f" - Std: {_fmt_stat(sp_log.std())}")

        # Verify log transform correctness: the stored column is expected to
        # equal ln(shadow_price + 1) to within 1e-3.
        manual_log = (sp + 1).log()
        max_diff = (sp_log - manual_log).abs().max()
        if max_diff is None:
            # No non-null pairs to compare (empty or all-null columns).
            print("\n[WARN] Log transform could not be verified (no non-null values)")
        elif max_diff < 0.001:
            print(f"\n[OK] Log transform verified correct (max diff: {max_diff:.6f})")
        else:
            print(f"\n[WARN] Log transform may have issues (max diff: {max_diff:.6f})")

    print("\n## 5. DATA QUALITY CHECKS")
    print(rule)
    # Check RAM clipping
    if 'ram' in updated_cnec.columns and 'fmax' in updated_cnec.columns:
        ram = updated_cnec['ram']
        fmax = updated_cnec['fmax']
        negative_ram = (ram < 0).sum()
        ram_exceeds_fmax = (ram > fmax).sum()
        print("RAM quality:")
        print(f" - Negative values: {negative_ram} (should be 0)")
        print(f" - RAM > fmax: {ram_exceeds_fmax} (should be 0)")
        if negative_ram == 0 and ram_exceeds_fmax == 0:
            print(" [OK] RAM properly clipped to [0, fmax]")
        else:
            print(" [WARN] RAM clipping may have issues")

    # Check PTDF clipping
    ptdf_cols = [col for col in updated_cnec.columns if col.startswith('ptdf_')]
    if ptdf_cols:
        # Count columns (not cells) holding any value outside [-1.5, +1.5].
        ptdf_issues = 0
        for col in ptdf_cols:
            ptdf_data = updated_cnec[col]
            if ((ptdf_data < -1.5) | (ptdf_data > 1.5)).sum() > 0:
                ptdf_issues += 1
        print("\nPTDF quality:")
        print(f" - Columns checked: {len(ptdf_cols)}")
        print(f" - Columns with out-of-range values: {ptdf_issues}")
        if ptdf_issues == 0:
            print(" [OK] All PTDFs properly clipped to [-1.5, +1.5]")
        else:
            print(" [WARN] Some PTDFs have out-of-range values")

    print("\n## 6. LTA DATA VALIDATION")
    print(rule)
    print(f"LTA records: {updated_lta.shape[0]}")
    print(f"LTA columns: {updated_lta.shape[1]}")
    print(f"LTA columns: {', '.join(updated_lta.columns[:10])}...")

    # Check if LTA has actual data (not all zeros)
    numeric_dtypes = [pl.Float64, pl.Float32, pl.Int64, pl.Int32]
    numeric_cols = [col for col in updated_lta.columns
                    if updated_lta[col].dtype in numeric_dtypes]
    if numeric_cols:
        # Only the first 5 numeric columns are sampled. A sum of None
        # (no values to add) is treated the same as zero, i.e. "no data".
        has_data = any(updated_lta[col].sum() not in (None, 0)
                       for col in numeric_cols[:5])
        if has_data:
            print("[OK] LTA contains actual allocation data")
        else:
            print("[WARN] LTA data may be all zeros")

    print("\n## 7. FILE SIZE COMPARISON")
    print(rule)
    updated_cnec_size = Path(updated_cnec_path).stat().st_size
    updated_maxbex_size = Path(updated_maxbex_path).stat().st_size
    updated_lta_size = Path(updated_lta_path).stat().st_size
    total_size = updated_cnec_size + updated_maxbex_size + updated_lta_size
    print(f"Updated CNEC file: {updated_cnec_size / 1024:.1f} KB")
    print(f"Updated MaxBEX file: {updated_maxbex_size / 1024:.1f} KB")
    print(f"Updated LTA file: {updated_lta_size / 1024:.1f} KB")
    print(f"Total: {total_size / 1024:.1f} KB")
    if has_original:
        original_cnec_size = Path(original_cnec_path).stat().st_size
        reduction = 100 * (original_cnec_size - updated_cnec_size) / original_cnec_size
        print(f"\nCNEC file size reduction: {reduction:.1f}%")

    print("\n" + banner)
    print("VALIDATION COMPLETE")
    print(banner)
# Script entry point: run the validation report only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()