Evgueni Poloukarov and Claude committed
Commit a57b996 · 1 Parent(s): 7f2c237

feat: add 396 volatility features for zero-shot forecast improvement


Added per-border volatility metrics to help the model capture hour-to-hour variations:
- 132 hour-over-hour delta features (target_delta_h1_*)
- 132 24-hour rolling volatility features (target_vol_24h_*)
- 132 6-hour range features (target_range_6h_*)

Total features: 2,647 -> 3,043 columns
Dataset uploaded to HuggingFace: evgueni-p/fbmc-features-24month

Initial evaluation: NO improvement (0.0% MAE change)
Likely cause: Chronos-2 ignores target-derived covariates, or the jump to 3,043 columns dilutes the signal (feature explosion)

Co-Authored-By: Claude <[email protected]>
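A quick ablation makes the "ignored covariates" hypothesis testable: drop the 396 target-derived columns and rerun the same zero-shot evaluation. The sketch below is illustrative and not part of this commit; it only assumes the column prefixes and parquet path introduced here, and the evaluation call itself is left to the existing pipeline.

import polars as pl

df = pl.read_parquet("data/processed/features_unified_24month.parquet")

# Select the target-derived volatility columns added by this commit
vol_prefixes = ("target_delta_h1_", "target_vol_24h_", "target_range_6h_")
vol_cols = [c for c in df.columns if c.startswith(vol_prefixes)]
print(f"Dropping {len(vol_cols)} volatility columns")  # expected: 396

df_ablated = df.drop(vol_cols)
# Run both frames through the same zero-shot evaluation; identical MAE
# on df and df_ablated confirms Chronos-2 never reads these covariates.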

src/feature_engineering/engineer_jao_features.py CHANGED
@@ -618,6 +618,48 @@ def engineer_jao_features(
             unified[col].alias(target_name)
         ])
 
+    print(f"\n[INFO] Adding volatility features for {len(directional_cols)} directional borders...")
+    print("[INFO] Volatility features will help model capture hour-to-hour variations")
+
+    # Add volatility features for EACH directional border
+    # These features teach the model about hourly change patterns and volatility magnitude
+    for col in sorted(directional_cols):
+        from_country, to_country = col.split('>')
+        border_code = f'{from_country}_{to_country}'
+
+        # Get the target series
+        target_col = f'target_border_{border_code}'
+
+        # 1. Hour-over-hour delta (captures immediate hourly changes)
+        #    Formula: target[t] - target[t-1]
+        #    Helps model learn rapid swings and ramps
+        delta_col = f'target_delta_h1_{border_code}'
+        all_features = all_features.with_columns([
+            (all_features[target_col] - all_features[target_col].shift(1)).alias(delta_col)
+        ])
+
+        # 2. 24-hour rolling volatility (captures daily volatility patterns)
+        #    Formula: rolling_std(target, window=24h)
+        #    Informs model about magnitude of daily variation
+        vol_col = f'target_vol_24h_{border_code}'
+        all_features = all_features.with_columns([
+            all_features[target_col].rolling_std(window_size=24, min_periods=1).alias(vol_col)
+        ])
+
+        # 3. 6-hour range (captures intraday swings - morning/evening ramps)
+        #    Formula: rolling_max(target, 6h) - rolling_min(target, 6h)
+        #    Detects peak/off-peak transitions
+        range_col = f'target_range_6h_{border_code}'
+        all_features = all_features.with_columns([
+            (all_features[target_col].rolling_max(window_size=6, min_periods=1) -
+             all_features[target_col].rolling_min(window_size=6, min_periods=1)).alias(range_col)
+        ])
+
+    print(f"[OK] Added {len(directional_cols) * 3} volatility features:")
+    print(f"  - {len(directional_cols)} hour-over-hour delta features")
+    print(f"  - {len(directional_cols)} 24-hour rolling volatility features")
+    print(f"  - {len(directional_cols)} 6-hour range features")
+
     # Remove duplicates if any
     if 'mtu_right' in all_features.columns:
         all_features = all_features.drop([c for c in all_features.columns if c.endswith('_right')])
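A note on the committed loop: it calls with_columns eagerly three times per border, materializing a new frame on each of the 396 passes. A single-pass equivalent builds all expressions first; this is a sketch, assuming the same all_features frame, directional_cols set, and polars version (min_periods keyword) as the code above.

import polars as pl

exprs = []
for col in sorted(directional_cols):
    from_country, to_country = col.split('>')
    border = f'{from_country}_{to_country}'
    target = pl.col(f'target_border_{border}')
    exprs += [
        # Same three metrics as the committed code, as lazy expressions
        (target - target.shift(1)).alias(f'target_delta_h1_{border}'),
        target.rolling_std(window_size=24, min_periods=1).alias(f'target_vol_24h_{border}'),
        (target.rolling_max(window_size=6, min_periods=1)
         - target.rolling_min(window_size=6, min_periods=1)).alias(f'target_range_6h_{border}'),
    ]

# One pass over the frame instead of 396 separate with_columns calls
all_features = all_features.with_columns(exprs)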
upload_to_hf.py CHANGED
@@ -37,16 +37,16 @@ def upload_extended_dataset():
     login(token=hf_token)
     print("   [OK] Logged in")
 
-    # Load extended dataset
-    extended_file = Path("data/processed/features_unified_extended.parquet")
-    if not extended_file.exists():
-        raise FileNotFoundError(f"Extended dataset not found: {extended_file}")
+    # Load unified dataset with volatility features
+    unified_file = Path("data/processed/features_unified_24month.parquet")
+    if not unified_file.exists():
+        raise FileNotFoundError(f"Unified dataset not found: {unified_file}")
 
-    print(f"\nLoading extended dataset...")
-    df = pl.read_parquet(extended_file)
+    print(f"\nLoading unified dataset with volatility features...")
+    df = pl.read_parquet(unified_file)
     print(f"   Shape: {df.shape}")
     print(f"   Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")
-    print(f"   File size: {extended_file.stat().st_size / 1024 / 1024:.1f} MB")
+    print(f"   File size: {unified_file.stat().st_size / 1024 / 1024:.1f} MB")
 
     # Convert to HuggingFace Dataset
     print("\nConverting to HuggingFace Dataset format...")