Spaces:
Sleeping
Sleeping
File size: 5,131 Bytes
7aa0336 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
"""
Collect 24-Month Weather Data from OpenMeteo
=============================================
Collects hourly weather data from OpenMeteo Historical API for the full
24-month period (Oct 2023 - Sept 2025) across 52 strategic grid points.
7 Weather Variables:
- temperature_2m: Air temperature at 2m (C)
- windspeed_10m: Wind speed at 10m (m/s)
- windspeed_100m: Wind speed at 100m (m/s) - for wind generation
- winddirection_100m: Wind direction at 100m (degrees)
- shortwave_radiation: Solar radiation (W/m2) - for solar generation
- cloudcover: Cloud cover percentage
- surface_pressure: Surface air pressure (hPa)
Collection Strategy:
- 52 grid points (covering all FBMC zones + neighbors)
- 2-week chunks (1.0 API call each)
- 270 requests/minute (45% of 600 limit)
- Estimated runtime: ~5 minutes
Output: data/raw/weather_24month.parquet
Size: ~50-80 MB (52 points × 7 vars × 17,544 hours)
Features: 364 (52 × 7) when engineered
"""
import sys
from pathlib import Path
# Add src to path
sys.path.append(str(Path(__file__).parent.parent))
from src.data_collection.collect_openmeteo import OpenMeteoCollector
# Date range: Oct 2023 - Sept 2025 (24 months)
START_DATE = '2023-10-01'
END_DATE = '2025-09-30'

# Output file, anchored two directory levels above this script (the same
# directory that is appended to sys.path so that `src` can be imported).
OUTPUT_DIR = Path(__file__).parent.parent / 'data' / 'raw'
OUTPUT_FILE = OUTPUT_DIR / 'weather_24month.parquet'

# Create the destination directory up front. Nothing in this script guarantees
# that data/raw exists, and a missing directory would otherwise presumably
# only surface once the collector tries to write OUTPUT_FILE.
# NOTE(review): the collector may already create it; exist_ok=True makes this
# call a harmless no-op in that case.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print("="*80)
print("24-MONTH WEATHER DATA COLLECTION")
print("="*80)
print()
print("Period: October 2023 - September 2025 (24 months)")
print("Grid points: 52 strategic locations across FBMC")
print("Variables: 7 weather parameters")
print("Estimated runtime: ~5 minutes")
print()

# Initialize collector with safe rate limiting
print("Initializing OpenMeteo collector...")
collector = OpenMeteoCollector(
    requests_per_minute=270,  # 45% of 600 limit
    chunk_days=14,  # 1.0 API call per request
)
print("[OK] Collector initialized")
print()
# Run the collection, then report the outcome on stdout.
# Exit codes: 0 on success, 1 on empty result or unexpected error,
# 130 when the user interrupts the run.
_RULE = "=" * 80

try:
    df = collector.collect_all(
        start_date=START_DATE,
        end_date=END_DATE,
        output_path=OUTPUT_FILE,
    )

    # Guard clause: an empty frame means nothing was collected.
    if df.is_empty():
        for line in (
            "",
            "[ERROR] No weather data collected",
            "",
            "Possible causes:",
            " - OpenMeteo API access issues",
            " - Rate limit exceeded",
            " - Network connectivity problems",
            "",
        ):
            print(line)
        sys.exit(1)

    # --- Success summary -------------------------------------------------
    for line in ("", _RULE, "COLLECTION SUCCESS", _RULE, ""):
        print(line)

    print(f"Output: {OUTPUT_FILE}")

    row_count, column_count = df.shape
    print(f"Shape: {row_count:,} rows x {column_count} columns")

    timestamps = df['timestamp']
    print(f"Date range: {timestamps.min()} to {timestamps.max()}")
    print(f"Grid points: {df['grid_point'].n_unique()}")

    # Every column that is not one of these identifier/location columns is
    # counted as a weather variable.
    metadata_columns = ('timestamp', 'grid_point', 'location_name', 'latitude', 'longitude')
    weather_columns = [name for name in df.columns if name not in metadata_columns]
    print(f"Weather variables: {len(weather_columns)}")
    print()

    # --- Data quality summary --------------------------------------------
    # Share of null cells over the whole frame (rows x columns).
    total_nulls = df.null_count().sum_horizontal()[0]
    null_pct = (total_nulls / (row_count * column_count)) * 100
    print(f"Data completeness: {100 - null_pct:.2f}%")

    if null_pct > 0:
        print()
        print("Missing data by column:")
        for column in df.columns:
            missing = df[column].null_count()
            if missing <= 0:
                continue
            share = (missing / len(df)) * 100
            print(f" - {column}: {missing:,} ({share:.2f}%)")

    # --- Follow-up guidance ----------------------------------------------
    for line in (
        "",
        _RULE,
        "NEXT STEPS",
        _RULE,
        "",
        "1. Implement weather feature engineering:",
        " - Create src/feature_engineering/engineer_weather_features.py",
        " - Engineer ~364 features (52 grid points x 7 variables)",
        " - Add spatial aggregation (zone-level averages)",
        "",
        "2. Expected features:",
        " - Grid-level: temp_{grid_point}, wind_{grid_point}, solar_{grid_point}, etc.",
        " - Zone-level: temp_avg_{zone}, wind_avg_{zone}, solar_avg_{zone}, etc.",
        " - Lags: Previous 1h, 6h, 12h, 24h for key variables",
        "",
        "3. Final unified features:",
        " - JAO: 1,698",
        " - ENTSO-E: 296",
        " - Weather: 364",
        " - Total: ~2,358 features",
        "",
        "[OK] Weather data collection COMPLETE!",
    ):
        print(line)

except KeyboardInterrupt:
    # Ctrl-C: explain that a fresh run is required, then exit with 130.
    for line in (
        "",
        "",
        _RULE,
        "COLLECTION INTERRUPTED",
        _RULE,
        "",
        "Collection was stopped by user.",
        "",
        "NOTE: OpenMeteo collection does NOT have checkpoint/resume capability",
        " (collection completes in ~5 minutes, so not needed)",
        "",
        "To restart: Run this script again",
        "",
    ):
        print(line)
    sys.exit(130)

except Exception as exc:
    # Top-level boundary: show the message and full traceback, then exit 1.
    for line in ("", "", _RULE, "COLLECTION FAILED", _RULE, ""):
        print(line)
    print(f"Error: {exc}")
    print()
    import traceback
    traceback.print_exc()
    print()
    sys.exit(1)
|