sebastiansarasti commited on
Commit
07e4611
·
1 Parent(s): 27c5c44

FEATURE: Adding the secretaria de quito data

Browse files
.gitignore CHANGED
@@ -52,7 +52,7 @@ htmlcov/
52
  *.db
53
  *.mo
54
 
55
- *.parquet
56
  *.xlsx
57
  *.json
58
  **/AutogluonModels/**
 
52
  *.db
53
  *.mo
54
 
55
+
56
  *.xlsx
57
  *.json
58
  **/AutogluonModels/**
notebooks/01_benchmark.ipynb CHANGED
@@ -37,53 +37,9 @@
37
  },
38
  {
39
  "cell_type": "code",
40
- "execution_count": 197,
41
  "metadata": {},
42
- "outputs": [
43
- {
44
- "ename": "ConnectTimeout",
45
- "evalue": "HTTPSConnectionPool(host='www.kaggle.com', port=443): Max retries exceeded with url: /api/v1/datasets/view/rickandjoe/electricity-transformer-dataset-etdataset (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x34a15e910>, 'Connection to www.kaggle.com timed out. (connect timeout=5)'))",
46
- "output_type": "error",
47
- "traceback": [
48
- "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
49
- "\u001b[31mTimeoutError\u001b[39m Traceback (most recent call last)",
50
- "\u001b[36mFile \u001b[39m\u001b[32m/opt/anaconda3/envs/aws_conf/lib/python3.11/site-packages/urllib3/connection.py:198\u001b[39m, in \u001b[36mHTTPConnection._new_conn\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 197\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m198\u001b[39m sock = \u001b[43mconnection\u001b[49m\u001b[43m.\u001b[49m\u001b[43mcreate_connection\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 199\u001b[39m \u001b[43m \u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_dns_host\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mport\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 200\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 201\u001b[39m \u001b[43m \u001b[49m\u001b[43msource_address\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43msource_address\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 202\u001b[39m \u001b[43m \u001b[49m\u001b[43msocket_options\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43msocket_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 203\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 204\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m socket.gaierror \u001b[38;5;28;01mas\u001b[39;00m e:\n",
51
- "\u001b[36mFile \u001b[39m\u001b[32m/opt/anaconda3/envs/aws_conf/lib/python3.11/site-packages/urllib3/util/connection.py:85\u001b[39m, in \u001b[36mcreate_connection\u001b[39m\u001b[34m(address, timeout, source_address, socket_options)\u001b[39m\n\u001b[32m 84\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m---> \u001b[39m\u001b[32m85\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m err\n\u001b[32m 86\u001b[39m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[32m 87\u001b[39m \u001b[38;5;66;03m# Break explicitly a reference cycle\u001b[39;00m\n",
52
- "\u001b[36mFile \u001b[39m\u001b[32m/opt/anaconda3/envs/aws_conf/lib/python3.11/site-packages/urllib3/util/connection.py:73\u001b[39m, in \u001b[36mcreate_connection\u001b[39m\u001b[34m(address, timeout, source_address, socket_options)\u001b[39m\n\u001b[32m 72\u001b[39m sock.bind(source_address)\n\u001b[32m---> \u001b[39m\u001b[32m73\u001b[39m \u001b[43msock\u001b[49m\u001b[43m.\u001b[49m\u001b[43mconnect\u001b[49m\u001b[43m(\u001b[49m\u001b[43msa\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 74\u001b[39m \u001b[38;5;66;03m# Break explicitly a reference cycle\u001b[39;00m\n",
53
- "\u001b[31mTimeoutError\u001b[39m: timed out",
54
- "\nThe above exception was the direct cause of the following exception:\n",
55
- "\u001b[31mConnectTimeoutError\u001b[39m Traceback (most recent call last)",
56
- "\u001b[36mFile \u001b[39m\u001b[32m/opt/anaconda3/envs/aws_conf/lib/python3.11/site-packages/urllib3/connectionpool.py:787\u001b[39m, in \u001b[36mHTTPConnectionPool.urlopen\u001b[39m\u001b[34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, preload_content, decode_content, **response_kw)\u001b[39m\n\u001b[32m 786\u001b[39m \u001b[38;5;66;03m# Make the request on the HTTPConnection object\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m787\u001b[39m response = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_make_request\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 788\u001b[39m \u001b[43m \u001b[49m\u001b[43mconn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 789\u001b[39m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 790\u001b[39m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 791\u001b[39m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtimeout_obj\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 792\u001b[39m \u001b[43m \u001b[49m\u001b[43mbody\u001b[49m\u001b[43m=\u001b[49m\u001b[43mbody\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 793\u001b[39m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m=\u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 794\u001b[39m \u001b[43m \u001b[49m\u001b[43mchunked\u001b[49m\u001b[43m=\u001b[49m\u001b[43mchunked\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 795\u001b[39m \u001b[43m \u001b[49m\u001b[43mretries\u001b[49m\u001b[43m=\u001b[49m\u001b[43mretries\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 796\u001b[39m \u001b[43m \u001b[49m\u001b[43mresponse_conn\u001b[49m\u001b[43m=\u001b[49m\u001b[43mresponse_conn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 797\u001b[39m \u001b[43m \u001b[49m\u001b[43mpreload_content\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpreload_content\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 798\u001b[39m 
\u001b[43m \u001b[49m\u001b[43mdecode_content\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdecode_content\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 799\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mresponse_kw\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 800\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 802\u001b[39m \u001b[38;5;66;03m# Everything went great!\u001b[39;00m\n",
57
- "\u001b[36mFile \u001b[39m\u001b[32m/opt/anaconda3/envs/aws_conf/lib/python3.11/site-packages/urllib3/connectionpool.py:488\u001b[39m, in \u001b[36mHTTPConnectionPool._make_request\u001b[39m\u001b[34m(self, conn, method, url, body, headers, retries, timeout, chunked, response_conn, preload_content, decode_content, enforce_content_length)\u001b[39m\n\u001b[32m 487\u001b[39m new_e = _wrap_proxy_error(new_e, conn.proxy.scheme)\n\u001b[32m--> \u001b[39m\u001b[32m488\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m new_e\n\u001b[32m 490\u001b[39m \u001b[38;5;66;03m# conn.request() calls http.client.*.request, not the method in\u001b[39;00m\n\u001b[32m 491\u001b[39m \u001b[38;5;66;03m# urllib3.request. It also calls makefile (recv) on the socket.\u001b[39;00m\n",
58
- "\u001b[36mFile \u001b[39m\u001b[32m/opt/anaconda3/envs/aws_conf/lib/python3.11/site-packages/urllib3/connectionpool.py:464\u001b[39m, in \u001b[36mHTTPConnectionPool._make_request\u001b[39m\u001b[34m(self, conn, method, url, body, headers, retries, timeout, chunked, response_conn, preload_content, decode_content, enforce_content_length)\u001b[39m\n\u001b[32m 463\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m464\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_validate_conn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mconn\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 465\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m (SocketTimeout, BaseSSLError) \u001b[38;5;28;01mas\u001b[39;00m e:\n",
59
- "\u001b[36mFile \u001b[39m\u001b[32m/opt/anaconda3/envs/aws_conf/lib/python3.11/site-packages/urllib3/connectionpool.py:1093\u001b[39m, in \u001b[36mHTTPSConnectionPool._validate_conn\u001b[39m\u001b[34m(self, conn)\u001b[39m\n\u001b[32m 1092\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m conn.is_closed:\n\u001b[32m-> \u001b[39m\u001b[32m1093\u001b[39m \u001b[43mconn\u001b[49m\u001b[43m.\u001b[49m\u001b[43mconnect\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1095\u001b[39m \u001b[38;5;66;03m# TODO revise this, see https://github.com/urllib3/urllib3/issues/2791\u001b[39;00m\n",
60
- "\u001b[36mFile \u001b[39m\u001b[32m/opt/anaconda3/envs/aws_conf/lib/python3.11/site-packages/urllib3/connection.py:753\u001b[39m, in \u001b[36mHTTPSConnection.connect\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 752\u001b[39m sock: socket.socket | ssl.SSLSocket\n\u001b[32m--> \u001b[39m\u001b[32m753\u001b[39m \u001b[38;5;28mself\u001b[39m.sock = sock = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_new_conn\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 754\u001b[39m server_hostname: \u001b[38;5;28mstr\u001b[39m = \u001b[38;5;28mself\u001b[39m.host\n",
61
- "\u001b[36mFile \u001b[39m\u001b[32m/opt/anaconda3/envs/aws_conf/lib/python3.11/site-packages/urllib3/connection.py:207\u001b[39m, in \u001b[36mHTTPConnection._new_conn\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 206\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m SocketTimeout \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[32m--> \u001b[39m\u001b[32m207\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m ConnectTimeoutError(\n\u001b[32m 208\u001b[39m \u001b[38;5;28mself\u001b[39m,\n\u001b[32m 209\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mConnection to \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m.host\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m timed out. (connect timeout=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m.timeout\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m)\u001b[39m\u001b[33m\"\u001b[39m,\n\u001b[32m 210\u001b[39m ) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01me\u001b[39;00m\n\u001b[32m 212\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n",
62
- "\u001b[31mConnectTimeoutError\u001b[39m: (<urllib3.connection.HTTPSConnection object at 0x34a15e910>, 'Connection to www.kaggle.com timed out. (connect timeout=5)')",
63
- "\nThe above exception was the direct cause of the following exception:\n",
64
- "\u001b[31mMaxRetryError\u001b[39m Traceback (most recent call last)",
65
- "\u001b[36mFile \u001b[39m\u001b[32m/opt/anaconda3/envs/aws_conf/lib/python3.11/site-packages/requests/adapters.py:644\u001b[39m, in \u001b[36mHTTPAdapter.send\u001b[39m\u001b[34m(self, request, stream, timeout, verify, cert, proxies)\u001b[39m\n\u001b[32m 643\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m644\u001b[39m resp = \u001b[43mconn\u001b[49m\u001b[43m.\u001b[49m\u001b[43murlopen\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 645\u001b[39m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m.\u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 646\u001b[39m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m=\u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 647\u001b[39m \u001b[43m \u001b[49m\u001b[43mbody\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m.\u001b[49m\u001b[43mbody\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 648\u001b[39m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m.\u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 649\u001b[39m \u001b[43m \u001b[49m\u001b[43mredirect\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 650\u001b[39m \u001b[43m \u001b[49m\u001b[43massert_same_host\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 651\u001b[39m \u001b[43m \u001b[49m\u001b[43mpreload_content\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 652\u001b[39m \u001b[43m \u001b[49m\u001b[43mdecode_content\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 653\u001b[39m \u001b[43m 
\u001b[49m\u001b[43mretries\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mmax_retries\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 654\u001b[39m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 655\u001b[39m \u001b[43m \u001b[49m\u001b[43mchunked\u001b[49m\u001b[43m=\u001b[49m\u001b[43mchunked\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 656\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 658\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m (ProtocolError, \u001b[38;5;167;01mOSError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m err:\n",
66
- "\u001b[36mFile \u001b[39m\u001b[32m/opt/anaconda3/envs/aws_conf/lib/python3.11/site-packages/urllib3/connectionpool.py:841\u001b[39m, in \u001b[36mHTTPConnectionPool.urlopen\u001b[39m\u001b[34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, preload_content, decode_content, **response_kw)\u001b[39m\n\u001b[32m 839\u001b[39m new_e = ProtocolError(\u001b[33m\"\u001b[39m\u001b[33mConnection aborted.\u001b[39m\u001b[33m\"\u001b[39m, new_e)\n\u001b[32m--> \u001b[39m\u001b[32m841\u001b[39m retries = \u001b[43mretries\u001b[49m\u001b[43m.\u001b[49m\u001b[43mincrement\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 842\u001b[39m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merror\u001b[49m\u001b[43m=\u001b[49m\u001b[43mnew_e\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m_pool\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m_stacktrace\u001b[49m\u001b[43m=\u001b[49m\u001b[43msys\u001b[49m\u001b[43m.\u001b[49m\u001b[43mexc_info\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m[\u001b[49m\u001b[32;43m2\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[32m 843\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 844\u001b[39m retries.sleep()\n",
67
- "\u001b[36mFile \u001b[39m\u001b[32m/opt/anaconda3/envs/aws_conf/lib/python3.11/site-packages/urllib3/util/retry.py:519\u001b[39m, in \u001b[36mRetry.increment\u001b[39m\u001b[34m(self, method, url, response, error, _pool, _stacktrace)\u001b[39m\n\u001b[32m 518\u001b[39m reason = error \u001b[38;5;129;01mor\u001b[39;00m ResponseError(cause)\n\u001b[32m--> \u001b[39m\u001b[32m519\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m MaxRetryError(_pool, url, reason) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mreason\u001b[39;00m \u001b[38;5;66;03m# type: ignore[arg-type]\u001b[39;00m\n\u001b[32m 521\u001b[39m log.debug(\u001b[33m\"\u001b[39m\u001b[33mIncremented Retry for (url=\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[33m'\u001b[39m\u001b[33m): \u001b[39m\u001b[38;5;132;01m%r\u001b[39;00m\u001b[33m\"\u001b[39m, url, new_retry)\n",
68
- "\u001b[31mMaxRetryError\u001b[39m: HTTPSConnectionPool(host='www.kaggle.com', port=443): Max retries exceeded with url: /api/v1/datasets/view/rickandjoe/electricity-transformer-dataset-etdataset (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x34a15e910>, 'Connection to www.kaggle.com timed out. (connect timeout=5)'))",
69
- "\nDuring handling of the above exception, another exception occurred:\n",
70
- "\u001b[31mConnectTimeout\u001b[39m Traceback (most recent call last)",
71
- "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[197]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m path = \u001b[43mkagglehub\u001b[49m\u001b[43m.\u001b[49m\u001b[43mdataset_download\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mrickandjoe/electricity-transformer-dataset-etdataset\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
72
- "\u001b[36mFile \u001b[39m\u001b[32m/opt/anaconda3/envs/aws_conf/lib/python3.11/site-packages/kagglehub/datasets.py:43\u001b[39m, in \u001b[36mdataset_download\u001b[39m\u001b[34m(handle, path, force_download)\u001b[39m\n\u001b[32m 41\u001b[39m h = parse_dataset_handle(handle)\n\u001b[32m 42\u001b[39m logger.info(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mDownloading Dataset: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mh.to_url()\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m ...\u001b[39m\u001b[33m\"\u001b[39m, extra={**EXTRA_CONSOLE_BLOCK})\n\u001b[32m---> \u001b[39m\u001b[32m43\u001b[39m path, _ = \u001b[43mregistry\u001b[49m\u001b[43m.\u001b[49m\u001b[43mdataset_resolver\u001b[49m\u001b[43m(\u001b[49m\u001b[43mh\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mforce_download\u001b[49m\u001b[43m=\u001b[49m\u001b[43mforce_download\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 44\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m path\n",
73
- "\u001b[36mFile \u001b[39m\u001b[32m/opt/anaconda3/envs/aws_conf/lib/python3.11/site-packages/kagglehub/registry.py:28\u001b[39m, in \u001b[36mMultiImplRegistry.__call__\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 26\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m impl \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mreversed\u001b[39m(\u001b[38;5;28mself\u001b[39m._impls):\n\u001b[32m 27\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m impl.is_supported(*args, **kwargs):\n\u001b[32m---> \u001b[39m\u001b[32m28\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mimpl\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 29\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 30\u001b[39m fails.append(\u001b[38;5;28mtype\u001b[39m(impl).\u001b[34m__name__\u001b[39m)\n",
74
- "\u001b[36mFile \u001b[39m\u001b[32m/opt/anaconda3/envs/aws_conf/lib/python3.11/site-packages/kagglehub/resolver.py:29\u001b[39m, in \u001b[36mResolver.__call__\u001b[39m\u001b[34m(self, handle, path, force_download)\u001b[39m\n\u001b[32m 15\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m__call__\u001b[39m(\n\u001b[32m 16\u001b[39m \u001b[38;5;28mself\u001b[39m, handle: T, path: Optional[\u001b[38;5;28mstr\u001b[39m] = \u001b[38;5;28;01mNone\u001b[39;00m, *, force_download: Optional[\u001b[38;5;28mbool\u001b[39m] = \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[32m 17\u001b[39m ) -> \u001b[38;5;28mtuple\u001b[39m[\u001b[38;5;28mstr\u001b[39m, Optional[\u001b[38;5;28mint\u001b[39m]]:\n\u001b[32m 18\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"Resolves a handle into a path with the requested file(s) and the resource's version number.\u001b[39;00m\n\u001b[32m 19\u001b[39m \n\u001b[32m 20\u001b[39m \u001b[33;03m Args:\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 27\u001b[39m \u001b[33;03m Some cases where version number might be missing: Competition datasource, API-based models.\u001b[39;00m\n\u001b[32m 28\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m29\u001b[39m path, version = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_resolve\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhandle\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mforce_download\u001b[49m\u001b[43m=\u001b[49m\u001b[43mforce_download\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 31\u001b[39m \u001b[38;5;66;03m# Note handles are immutable, so _resolve() could not have altered our reference\u001b[39;00m\n\u001b[32m 32\u001b[39m register_datasource_access(handle, version)\n",
75
- "\u001b[36mFile \u001b[39m\u001b[32m/opt/anaconda3/envs/aws_conf/lib/python3.11/site-packages/kagglehub/http_resolver.py:107\u001b[39m, in \u001b[36mDatasetHttpResolver._resolve\u001b[39m\u001b[34m(self, h, path, force_download)\u001b[39m\n\u001b[32m 104\u001b[39m api_client = KaggleApiV1Client()\n\u001b[32m 106\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m h.is_versioned():\n\u001b[32m--> \u001b[39m\u001b[32m107\u001b[39m h = h.with_version(\u001b[43m_get_current_version\u001b[49m\u001b[43m(\u001b[49m\u001b[43mapi_client\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mh\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[32m 109\u001b[39m dataset_path = load_from_cache(h, path)\n\u001b[32m 110\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m dataset_path \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m force_download:\n",
76
- "\u001b[36mFile \u001b[39m\u001b[32m/opt/anaconda3/envs/aws_conf/lib/python3.11/site-packages/kagglehub/http_resolver.py:290\u001b[39m, in \u001b[36m_get_current_version\u001b[39m\u001b[34m(api_client, h)\u001b[39m\n\u001b[32m 287\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m json_response[MODEL_INSTANCE_VERSION_FIELD]\n\u001b[32m 289\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(h, DatasetHandle):\n\u001b[32m--> \u001b[39m\u001b[32m290\u001b[39m json_response = \u001b[43mapi_client\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[43m_build_get_dataset_url_path\u001b[49m\u001b[43m(\u001b[49m\u001b[43mh\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mh\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 291\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m DATASET_CURRENT_VERSION_FIELD \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m json_response:\n\u001b[32m 292\u001b[39m msg = \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mInvalid GetDataset API response. Expected to include a \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mDATASET_CURRENT_VERSION_FIELD\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m field\u001b[39m\u001b[33m\"\u001b[39m\n",
77
- "\u001b[36mFile \u001b[39m\u001b[32m/opt/anaconda3/envs/aws_conf/lib/python3.11/site-packages/kagglehub/clients.py:133\u001b[39m, in \u001b[36mKaggleApiV1Client.get\u001b[39m\u001b[34m(self, path, resource_handle)\u001b[39m\n\u001b[32m 131\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mget\u001b[39m(\u001b[38;5;28mself\u001b[39m, path: \u001b[38;5;28mstr\u001b[39m, resource_handle: Optional[ResourceHandle] = \u001b[38;5;28;01mNone\u001b[39;00m) -> \u001b[38;5;28mdict\u001b[39m:\n\u001b[32m 132\u001b[39m url = \u001b[38;5;28mself\u001b[39m._build_url(path)\n\u001b[32m--> \u001b[39m\u001b[32m133\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[43mrequests\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 134\u001b[39m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 135\u001b[39m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m=\u001b[49m\u001b[43m{\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mUser-Agent\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mget_user_agent\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 136\u001b[39m \u001b[43m \u001b[49m\u001b[43mauth\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_get_auth\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 137\u001b[39m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m=\u001b[49m\u001b[43m(\u001b[49m\u001b[43mDEFAULT_CONNECT_TIMEOUT\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mDEFAULT_READ_TIMEOUT\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 138\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m response:\n\u001b[32m 139\u001b[39m kaggle_api_raise_for_status(response, resource_handle)\n\u001b[32m 140\u001b[39m 
\u001b[38;5;28mself\u001b[39m._check_for_version_update(response)\n",
78
- "\u001b[36mFile \u001b[39m\u001b[32m/opt/anaconda3/envs/aws_conf/lib/python3.11/site-packages/requests/api.py:73\u001b[39m, in \u001b[36mget\u001b[39m\u001b[34m(url, params, **kwargs)\u001b[39m\n\u001b[32m 62\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mget\u001b[39m(url, params=\u001b[38;5;28;01mNone\u001b[39;00m, **kwargs):\n\u001b[32m 63\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33mr\u001b[39m\u001b[33;03m\"\"\"Sends a GET request.\u001b[39;00m\n\u001b[32m 64\u001b[39m \n\u001b[32m 65\u001b[39m \u001b[33;03m :param url: URL for the new :class:`Request` object.\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 70\u001b[39m \u001b[33;03m :rtype: requests.Response\u001b[39;00m\n\u001b[32m 71\u001b[39m \u001b[33;03m \"\"\"\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m73\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mrequest\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mget\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[43m=\u001b[49m\u001b[43mparams\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
79
- "\u001b[36mFile \u001b[39m\u001b[32m/opt/anaconda3/envs/aws_conf/lib/python3.11/site-packages/requests/api.py:59\u001b[39m, in \u001b[36mrequest\u001b[39m\u001b[34m(method, url, **kwargs)\u001b[39m\n\u001b[32m 55\u001b[39m \u001b[38;5;66;03m# By using the 'with' statement we are sure the session is closed, thus we\u001b[39;00m\n\u001b[32m 56\u001b[39m \u001b[38;5;66;03m# avoid leaving sockets open which can trigger a ResourceWarning in some\u001b[39;00m\n\u001b[32m 57\u001b[39m \u001b[38;5;66;03m# cases, and look like a memory leak in others.\u001b[39;00m\n\u001b[32m 58\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m sessions.Session() \u001b[38;5;28;01mas\u001b[39;00m session:\n\u001b[32m---> \u001b[39m\u001b[32m59\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43msession\u001b[49m\u001b[43m.\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m=\u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m=\u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
80
- "\u001b[36mFile \u001b[39m\u001b[32m/opt/anaconda3/envs/aws_conf/lib/python3.11/site-packages/requests/sessions.py:589\u001b[39m, in \u001b[36mSession.request\u001b[39m\u001b[34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[39m\n\u001b[32m 584\u001b[39m send_kwargs = {\n\u001b[32m 585\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mtimeout\u001b[39m\u001b[33m\"\u001b[39m: timeout,\n\u001b[32m 586\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mallow_redirects\u001b[39m\u001b[33m\"\u001b[39m: allow_redirects,\n\u001b[32m 587\u001b[39m }\n\u001b[32m 588\u001b[39m send_kwargs.update(settings)\n\u001b[32m--> \u001b[39m\u001b[32m589\u001b[39m resp = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43msend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprep\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43msend_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 591\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m resp\n",
81
- "\u001b[36mFile \u001b[39m\u001b[32m/opt/anaconda3/envs/aws_conf/lib/python3.11/site-packages/requests/sessions.py:703\u001b[39m, in \u001b[36mSession.send\u001b[39m\u001b[34m(self, request, **kwargs)\u001b[39m\n\u001b[32m 700\u001b[39m start = preferred_clock()\n\u001b[32m 702\u001b[39m \u001b[38;5;66;03m# Send the request\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m703\u001b[39m r = \u001b[43madapter\u001b[49m\u001b[43m.\u001b[49m\u001b[43msend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 705\u001b[39m \u001b[38;5;66;03m# Total elapsed time of the request (approximately)\u001b[39;00m\n\u001b[32m 706\u001b[39m elapsed = preferred_clock() - start\n",
82
- "\u001b[36mFile \u001b[39m\u001b[32m/opt/anaconda3/envs/aws_conf/lib/python3.11/site-packages/requests/adapters.py:665\u001b[39m, in \u001b[36mHTTPAdapter.send\u001b[39m\u001b[34m(self, request, stream, timeout, verify, cert, proxies)\u001b[39m\n\u001b[32m 662\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(e.reason, ConnectTimeoutError):\n\u001b[32m 663\u001b[39m \u001b[38;5;66;03m# TODO: Remove this in 3.0.0: see #2811\u001b[39;00m\n\u001b[32m 664\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(e.reason, NewConnectionError):\n\u001b[32m--> \u001b[39m\u001b[32m665\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m ConnectTimeout(e, request=request)\n\u001b[32m 667\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(e.reason, ResponseError):\n\u001b[32m 668\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m RetryError(e, request=request)\n",
83
- "\u001b[31mConnectTimeout\u001b[39m: HTTPSConnectionPool(host='www.kaggle.com', port=443): Max retries exceeded with url: /api/v1/datasets/view/rickandjoe/electricity-transformer-dataset-etdataset (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x34a15e910>, 'Connection to www.kaggle.com timed out. (connect timeout=5)'))"
84
- ]
85
- }
86
- ],
87
  "source": [
88
  "path = kagglehub.dataset_download(\n",
89
  " \"rickandjoe/electricity-transformer-dataset-etdataset\"\n",
@@ -99,7 +55,7 @@
99
  },
100
  {
101
  "cell_type": "code",
102
- "execution_count": 198,
103
  "metadata": {},
104
  "outputs": [],
105
  "source": [
@@ -108,7 +64,7 @@
108
  },
109
  {
110
  "cell_type": "code",
111
- "execution_count": 199,
112
  "metadata": {},
113
  "outputs": [],
114
  "source": [
@@ -117,7 +73,7 @@
117
  },
118
  {
119
  "cell_type": "code",
120
- "execution_count": 200,
121
  "metadata": {},
122
  "outputs": [],
123
  "source": [
@@ -126,7 +82,7 @@
126
  },
127
  {
128
  "cell_type": "code",
129
- "execution_count": 201,
130
  "metadata": {},
131
  "outputs": [],
132
  "source": [
@@ -142,7 +98,7 @@
142
  },
143
  {
144
  "cell_type": "code",
145
- "execution_count": 202,
146
  "metadata": {},
147
  "outputs": [],
148
  "source": [
@@ -158,7 +114,7 @@
158
  },
159
  {
160
  "cell_type": "code",
161
- "execution_count": 203,
162
  "metadata": {},
163
  "outputs": [],
164
  "source": [
@@ -174,7 +130,7 @@
174
  },
175
  {
176
  "cell_type": "code",
177
- "execution_count": 204,
178
  "metadata": {},
179
  "outputs": [],
180
  "source": [
@@ -190,7 +146,7 @@
190
  },
191
  {
192
  "cell_type": "code",
193
- "execution_count": 205,
194
  "metadata": {},
195
  "outputs": [],
196
  "source": [
@@ -199,7 +155,7 @@
199
  },
200
  {
201
  "cell_type": "code",
202
- "execution_count": 206,
203
  "metadata": {},
204
  "outputs": [],
205
  "source": [
@@ -208,13 +164,142 @@
208
  },
209
  {
210
  "cell_type": "code",
211
- "execution_count": 207,
212
  "metadata": {},
213
  "outputs": [],
214
  "source": [
215
  "df_final = df_final.sort_values([\"unique_id\", \"ds\"])"
216
  ]
217
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
  {
219
  "cell_type": "markdown",
220
  "metadata": {},
@@ -231,7 +316,7 @@
231
  },
232
  {
233
  "cell_type": "code",
234
- "execution_count": 208,
235
  "metadata": {},
236
  "outputs": [],
237
  "source": [
@@ -247,7 +332,7 @@
247
  },
248
  {
249
  "cell_type": "code",
250
- "execution_count": 209,
251
  "metadata": {},
252
  "outputs": [],
253
  "source": [
@@ -278,7 +363,7 @@
278
  },
279
  {
280
  "cell_type": "code",
281
- "execution_count": 210,
282
  "metadata": {},
283
  "outputs": [],
284
  "source": [
@@ -309,7 +394,7 @@
309
  },
310
  {
311
  "cell_type": "code",
312
- "execution_count": 211,
313
  "metadata": {},
314
  "outputs": [],
315
  "source": [
@@ -346,7 +431,7 @@
346
  },
347
  {
348
  "cell_type": "code",
349
- "execution_count": 212,
350
  "metadata": {},
351
  "outputs": [],
352
  "source": [
@@ -399,7 +484,7 @@
399
  },
400
  {
401
  "cell_type": "code",
402
- "execution_count": 213,
403
  "metadata": {},
404
  "outputs": [],
405
  "source": [
@@ -408,7 +493,7 @@
408
  },
409
  {
410
  "cell_type": "code",
411
- "execution_count": 214,
412
  "metadata": {},
413
  "outputs": [],
414
  "source": [
@@ -417,7 +502,7 @@
417
  },
418
  {
419
  "cell_type": "code",
420
- "execution_count": 215,
421
  "metadata": {},
422
  "outputs": [],
423
  "source": [
@@ -426,7 +511,7 @@
426
  },
427
  {
428
  "cell_type": "code",
429
- "execution_count": 216,
430
  "metadata": {},
431
  "outputs": [],
432
  "source": [
@@ -449,7 +534,7 @@
449
  },
450
  {
451
  "cell_type": "code",
452
- "execution_count": 217,
453
  "metadata": {},
454
  "outputs": [],
455
  "source": [
@@ -458,7 +543,7 @@
458
  },
459
  {
460
  "cell_type": "code",
461
- "execution_count": 218,
462
  "metadata": {},
463
  "outputs": [],
464
  "source": [
@@ -467,7 +552,7 @@
467
  },
468
  {
469
  "cell_type": "code",
470
- "execution_count": 219,
471
  "metadata": {},
472
  "outputs": [],
473
  "source": [
@@ -483,7 +568,7 @@
483
  },
484
  {
485
  "cell_type": "code",
486
- "execution_count": 220,
487
  "metadata": {},
488
  "outputs": [],
489
  "source": [
@@ -501,7 +586,7 @@
501
  },
502
  {
503
  "cell_type": "code",
504
- "execution_count": 221,
505
  "metadata": {},
506
  "outputs": [
507
  {
@@ -511,7 +596,7 @@
511
  "<Figure size 1600x350 with 2 Axes>"
512
  ]
513
  },
514
- "execution_count": 221,
515
  "metadata": {},
516
  "output_type": "execute_result"
517
  }
@@ -536,7 +621,7 @@
536
  },
537
  {
538
  "cell_type": "code",
539
- "execution_count": 222,
540
  "metadata": {},
541
  "outputs": [],
542
  "source": [
@@ -556,7 +641,7 @@
556
  },
557
  {
558
  "cell_type": "code",
559
- "execution_count": 223,
560
  "metadata": {},
561
  "outputs": [],
562
  "source": [
@@ -565,7 +650,7 @@
565
  },
566
  {
567
  "cell_type": "code",
568
- "execution_count": 224,
569
  "metadata": {},
570
  "outputs": [],
571
  "source": [
@@ -578,7 +663,7 @@
578
  },
579
  {
580
  "cell_type": "code",
581
- "execution_count": 225,
582
  "metadata": {},
583
  "outputs": [
584
  {
@@ -625,7 +710,7 @@
625
  "0 9.148477 18.487053 27.713882 27.880458"
626
  ]
627
  },
628
- "execution_count": 225,
629
  "metadata": {},
630
  "output_type": "execute_result"
631
  }
 
37
  },
38
  {
39
  "cell_type": "code",
40
+ "execution_count": 2,
41
  "metadata": {},
42
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  "source": [
44
  "path = kagglehub.dataset_download(\n",
45
  " \"rickandjoe/electricity-transformer-dataset-etdataset\"\n",
 
55
  },
56
  {
57
  "cell_type": "code",
58
+ "execution_count": 3,
59
  "metadata": {},
60
  "outputs": [],
61
  "source": [
 
64
  },
65
  {
66
  "cell_type": "code",
67
+ "execution_count": 4,
68
  "metadata": {},
69
  "outputs": [],
70
  "source": [
 
73
  },
74
  {
75
  "cell_type": "code",
76
+ "execution_count": 5,
77
  "metadata": {},
78
  "outputs": [],
79
  "source": [
 
82
  },
83
  {
84
  "cell_type": "code",
85
+ "execution_count": 6,
86
  "metadata": {},
87
  "outputs": [],
88
  "source": [
 
98
  },
99
  {
100
  "cell_type": "code",
101
+ "execution_count": 7,
102
  "metadata": {},
103
  "outputs": [],
104
  "source": [
 
114
  },
115
  {
116
  "cell_type": "code",
117
+ "execution_count": 8,
118
  "metadata": {},
119
  "outputs": [],
120
  "source": [
 
130
  },
131
  {
132
  "cell_type": "code",
133
+ "execution_count": 9,
134
  "metadata": {},
135
  "outputs": [],
136
  "source": [
 
146
  },
147
  {
148
  "cell_type": "code",
149
+ "execution_count": 10,
150
  "metadata": {},
151
  "outputs": [],
152
  "source": [
 
155
  },
156
  {
157
  "cell_type": "code",
158
+ "execution_count": 11,
159
  "metadata": {},
160
  "outputs": [],
161
  "source": [
 
164
  },
165
  {
166
  "cell_type": "code",
167
+ "execution_count": 12,
168
  "metadata": {},
169
  "outputs": [],
170
  "source": [
171
  "df_final = df_final.sort_values([\"unique_id\", \"ds\"])"
172
  ]
173
  },
174
+ {
175
+ "cell_type": "code",
176
+ "execution_count": 33,
177
+ "metadata": {},
178
+ "outputs": [
179
+ {
180
+ "data": {
181
+ "text/html": [
182
+ "<div>\n",
183
+ "<style scoped>\n",
184
+ " .dataframe tbody tr th:only-of-type {\n",
185
+ " vertical-align: middle;\n",
186
+ " }\n",
187
+ "\n",
188
+ " .dataframe tbody tr th {\n",
189
+ " vertical-align: top;\n",
190
+ " }\n",
191
+ "\n",
192
+ " .dataframe thead th {\n",
193
+ " text-align: right;\n",
194
+ " }\n",
195
+ "</style>\n",
196
+ "<table border=\"1\" class=\"dataframe\">\n",
197
+ " <thead>\n",
198
+ " <tr style=\"text-align: right;\">\n",
199
+ " <th></th>\n",
200
+ " <th>ds</th>\n",
201
+ " <th>unique_id</th>\n",
202
+ " <th>y</th>\n",
203
+ " </tr>\n",
204
+ " </thead>\n",
205
+ " <tbody>\n",
206
+ " <tr>\n",
207
+ " <th>0</th>\n",
208
+ " <td>2016-07-01 00:00:00</td>\n",
209
+ " <td>transformer_1</td>\n",
210
+ " <td>30.531000</td>\n",
211
+ " </tr>\n",
212
+ " <tr>\n",
213
+ " <th>1</th>\n",
214
+ " <td>2016-07-01 01:00:00</td>\n",
215
+ " <td>transformer_1</td>\n",
216
+ " <td>27.787001</td>\n",
217
+ " </tr>\n",
218
+ " <tr>\n",
219
+ " <th>2</th>\n",
220
+ " <td>2016-07-01 02:00:00</td>\n",
221
+ " <td>transformer_1</td>\n",
222
+ " <td>27.787001</td>\n",
223
+ " </tr>\n",
224
+ " <tr>\n",
225
+ " <th>3</th>\n",
226
+ " <td>2016-07-01 03:00:00</td>\n",
227
+ " <td>transformer_1</td>\n",
228
+ " <td>25.044001</td>\n",
229
+ " </tr>\n",
230
+ " <tr>\n",
231
+ " <th>4</th>\n",
232
+ " <td>2016-07-01 04:00:00</td>\n",
233
+ " <td>transformer_1</td>\n",
234
+ " <td>21.948000</td>\n",
235
+ " </tr>\n",
236
+ " <tr>\n",
237
+ " <th>...</th>\n",
238
+ " <td>...</td>\n",
239
+ " <td>...</td>\n",
240
+ " <td>...</td>\n",
241
+ " </tr>\n",
242
+ " <tr>\n",
243
+ " <th>34835</th>\n",
244
+ " <td>2018-06-26 15:00:00</td>\n",
245
+ " <td>transformer_2</td>\n",
246
+ " <td>47.084999</td>\n",
247
+ " </tr>\n",
248
+ " <tr>\n",
249
+ " <th>34836</th>\n",
250
+ " <td>2018-06-26 16:00:00</td>\n",
251
+ " <td>transformer_2</td>\n",
252
+ " <td>48.183498</td>\n",
253
+ " </tr>\n",
254
+ " <tr>\n",
255
+ " <th>34837</th>\n",
256
+ " <td>2018-06-26 17:00:00</td>\n",
257
+ " <td>transformer_2</td>\n",
258
+ " <td>48.183498</td>\n",
259
+ " </tr>\n",
260
+ " <tr>\n",
261
+ " <th>34838</th>\n",
262
+ " <td>2018-06-26 18:00:00</td>\n",
263
+ " <td>transformer_2</td>\n",
264
+ " <td>46.865501</td>\n",
265
+ " </tr>\n",
266
+ " <tr>\n",
267
+ " <th>34839</th>\n",
268
+ " <td>2018-06-26 19:00:00</td>\n",
269
+ " <td>transformer_2</td>\n",
270
+ " <td>45.986500</td>\n",
271
+ " </tr>\n",
272
+ " </tbody>\n",
273
+ "</table>\n",
274
+ "<p>34840 rows × 3 columns</p>\n",
275
+ "</div>"
276
+ ],
277
+ "text/plain": [
278
+ " ds unique_id y\n",
279
+ "0 2016-07-01 00:00:00 transformer_1 30.531000\n",
280
+ "1 2016-07-01 01:00:00 transformer_1 27.787001\n",
281
+ "2 2016-07-01 02:00:00 transformer_1 27.787001\n",
282
+ "3 2016-07-01 03:00:00 transformer_1 25.044001\n",
283
+ "4 2016-07-01 04:00:00 transformer_1 21.948000\n",
284
+ "... ... ... ...\n",
285
+ "34835 2018-06-26 15:00:00 transformer_2 47.084999\n",
286
+ "34836 2018-06-26 16:00:00 transformer_2 48.183498\n",
287
+ "34837 2018-06-26 17:00:00 transformer_2 48.183498\n",
288
+ "34838 2018-06-26 18:00:00 transformer_2 46.865501\n",
289
+ "34839 2018-06-26 19:00:00 transformer_2 45.986500\n",
290
+ "\n",
291
+ "[34840 rows x 3 columns]"
292
+ ]
293
+ },
294
+ "execution_count": 33,
295
+ "metadata": {},
296
+ "output_type": "execute_result"
297
+ }
298
+ ],
299
+ "source": [
300
+ "df_final"
301
+ ]
302
+ },
303
  {
304
  "cell_type": "markdown",
305
  "metadata": {},
 
316
  },
317
  {
318
  "cell_type": "code",
319
+ "execution_count": 13,
320
  "metadata": {},
321
  "outputs": [],
322
  "source": [
 
332
  },
333
  {
334
  "cell_type": "code",
335
+ "execution_count": 14,
336
  "metadata": {},
337
  "outputs": [],
338
  "source": [
 
363
  },
364
  {
365
  "cell_type": "code",
366
+ "execution_count": 15,
367
  "metadata": {},
368
  "outputs": [],
369
  "source": [
 
394
  },
395
  {
396
  "cell_type": "code",
397
+ "execution_count": 16,
398
  "metadata": {},
399
  "outputs": [],
400
  "source": [
 
431
  },
432
  {
433
  "cell_type": "code",
434
+ "execution_count": 17,
435
  "metadata": {},
436
  "outputs": [],
437
  "source": [
 
484
  },
485
  {
486
  "cell_type": "code",
487
+ "execution_count": 18,
488
  "metadata": {},
489
  "outputs": [],
490
  "source": [
 
493
  },
494
  {
495
  "cell_type": "code",
496
+ "execution_count": 19,
497
  "metadata": {},
498
  "outputs": [],
499
  "source": [
 
502
  },
503
  {
504
  "cell_type": "code",
505
+ "execution_count": 20,
506
  "metadata": {},
507
  "outputs": [],
508
  "source": [
 
511
  },
512
  {
513
  "cell_type": "code",
514
+ "execution_count": 21,
515
  "metadata": {},
516
  "outputs": [],
517
  "source": [
 
534
  },
535
  {
536
  "cell_type": "code",
537
+ "execution_count": 22,
538
  "metadata": {},
539
  "outputs": [],
540
  "source": [
 
543
  },
544
  {
545
  "cell_type": "code",
546
+ "execution_count": 23,
547
  "metadata": {},
548
  "outputs": [],
549
  "source": [
 
552
  },
553
  {
554
  "cell_type": "code",
555
+ "execution_count": 24,
556
  "metadata": {},
557
  "outputs": [],
558
  "source": [
 
568
  },
569
  {
570
  "cell_type": "code",
571
+ "execution_count": 25,
572
  "metadata": {},
573
  "outputs": [],
574
  "source": [
 
586
  },
587
  {
588
  "cell_type": "code",
589
+ "execution_count": 26,
590
  "metadata": {},
591
  "outputs": [
592
  {
 
596
  "<Figure size 1600x350 with 2 Axes>"
597
  ]
598
  },
599
+ "execution_count": 26,
600
  "metadata": {},
601
  "output_type": "execute_result"
602
  }
 
621
  },
622
  {
623
  "cell_type": "code",
624
+ "execution_count": 27,
625
  "metadata": {},
626
  "outputs": [],
627
  "source": [
 
641
  },
642
  {
643
  "cell_type": "code",
644
+ "execution_count": 28,
645
  "metadata": {},
646
  "outputs": [],
647
  "source": [
 
650
  },
651
  {
652
  "cell_type": "code",
653
+ "execution_count": 29,
654
  "metadata": {},
655
  "outputs": [],
656
  "source": [
 
663
  },
664
  {
665
  "cell_type": "code",
666
+ "execution_count": 30,
667
  "metadata": {},
668
  "outputs": [
669
  {
 
710
  "0 9.148477 18.487053 27.713882 27.880458"
711
  ]
712
  },
713
+ "execution_count": 30,
714
  "metadata": {},
715
  "output_type": "execute_result"
716
  }
notebooks/02_data_wrangling.ipynb ADDED
@@ -0,0 +1,773 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import pandas as pd\n",
10
+ "import numpy as np"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 2,
16
+ "metadata": {},
17
+ "outputs": [],
18
+ "source": [
19
+ "df_lluvia = pd.read_excel(\n",
20
+ " \"/Users/sebastianalejandrosarastizambonino/Documents/conferences/aws_community_day_2025/data/ambiente_quito/CO.xlsx\",\n",
21
+ " skiprows=0,\n",
22
+ ")"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "markdown",
27
+ "metadata": {},
28
+ "source": [
29
+ "Select useful rows"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": 3,
35
+ "metadata": {},
36
+ "outputs": [],
37
+ "source": [
38
+ "df_lluvia = df_lluvia[1:]"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "markdown",
43
+ "metadata": {},
44
+ "source": [
45
+ "Rename the date column to ds"
46
+ ]
47
+ },
48
+ {
49
+ "cell_type": "code",
50
+ "execution_count": 4,
51
+ "metadata": {},
52
+ "outputs": [],
53
+ "source": [
54
+ "df_lluvia = df_lluvia.rename(columns={\"Unnamed: 0\": \"ds\"})"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": 5,
60
+ "metadata": {},
61
+ "outputs": [],
62
+ "source": [
63
+ "df_lluvia[\"ds\"] = pd.to_datetime(df_lluvia[\"ds\"])"
64
+ ]
65
+ },
66
+ {
67
+ "cell_type": "code",
68
+ "execution_count": 6,
69
+ "metadata": {},
70
+ "outputs": [],
71
+ "source": [
72
+ "df_lluvia_melted = pd.melt(\n",
73
+ " df_lluvia, id_vars=[\"ds\"], var_name=\"station\", value_name=\"y\"\n",
74
+ ")"
75
+ ]
76
+ },
77
+ {
78
+ "cell_type": "code",
79
+ "execution_count": 7,
80
+ "metadata": {},
81
+ "outputs": [],
82
+ "source": [
83
+ "df_lluvia_melted = df_lluvia_melted.dropna()"
84
+ ]
85
+ },
86
+ {
87
+ "cell_type": "markdown",
88
+ "metadata": {},
89
+ "source": [
90
+ "Seleccionar los lugares disponibles para la lluvia"
91
+ ]
92
+ },
93
+ {
94
+ "cell_type": "code",
95
+ "execution_count": 8,
96
+ "metadata": {},
97
+ "outputs": [],
98
+ "source": [
99
+ "places_lluvia = df_lluvia_melted[\"station\"].unique()"
100
+ ]
101
+ },
102
+ {
103
+ "cell_type": "code",
104
+ "execution_count": 9,
105
+ "metadata": {},
106
+ "outputs": [
107
+ {
108
+ "name": "stdout",
109
+ "output_type": "stream",
110
+ "text": [
111
+ "['BELISARIO' 'CARAPUNGO' 'CENTRO' 'COTOCOLLAO' 'EL CAMAL' 'GUAMANI'\n",
112
+ " 'LOS CHILLOS' 'TUMBACO' 'CONDADO' 'TURUBAMBA']\n"
113
+ ]
114
+ }
115
+ ],
116
+ "source": [
117
+ "print(places_lluvia)"
118
+ ]
119
+ },
120
+ {
121
+ "cell_type": "markdown",
122
+ "metadata": {},
123
+ "source": [
124
+ "See the min and max dates for the lluvia places"
125
+ ]
126
+ },
127
+ {
128
+ "cell_type": "code",
129
+ "execution_count": 10,
130
+ "metadata": {},
131
+ "outputs": [],
132
+ "source": [
133
+ "stats_lluvia = df_lluvia_melted.groupby(\"station\").agg({\"ds\": [\"min\", \"max\"]})"
134
+ ]
135
+ },
136
+ {
137
+ "cell_type": "code",
138
+ "execution_count": 11,
139
+ "metadata": {},
140
+ "outputs": [
141
+ {
142
+ "data": {
143
+ "text/html": [
144
+ "<div>\n",
145
+ "<style scoped>\n",
146
+ " .dataframe tbody tr th:only-of-type {\n",
147
+ " vertical-align: middle;\n",
148
+ " }\n",
149
+ "\n",
150
+ " .dataframe tbody tr th {\n",
151
+ " vertical-align: top;\n",
152
+ " }\n",
153
+ "\n",
154
+ " .dataframe thead tr th {\n",
155
+ " text-align: left;\n",
156
+ " }\n",
157
+ "\n",
158
+ " .dataframe thead tr:last-of-type th {\n",
159
+ " text-align: right;\n",
160
+ " }\n",
161
+ "</style>\n",
162
+ "<table border=\"1\" class=\"dataframe\">\n",
163
+ " <thead>\n",
164
+ " <tr>\n",
165
+ " <th></th>\n",
166
+ " <th colspan=\"2\" halign=\"left\">ds</th>\n",
167
+ " </tr>\n",
168
+ " <tr>\n",
169
+ " <th></th>\n",
170
+ " <th>min</th>\n",
171
+ " <th>max</th>\n",
172
+ " </tr>\n",
173
+ " <tr>\n",
174
+ " <th>station</th>\n",
175
+ " <th></th>\n",
176
+ " <th></th>\n",
177
+ " </tr>\n",
178
+ " </thead>\n",
179
+ " <tbody>\n",
180
+ " <tr>\n",
181
+ " <th>BELISARIO</th>\n",
182
+ " <td>2004-01-01 00:00:00</td>\n",
183
+ " <td>2025-09-30 23:00:00</td>\n",
184
+ " </tr>\n",
185
+ " <tr>\n",
186
+ " <th>CARAPUNGO</th>\n",
187
+ " <td>2005-03-16 15:00:00</td>\n",
188
+ " <td>2025-09-30 23:00:00</td>\n",
189
+ " </tr>\n",
190
+ " <tr>\n",
191
+ " <th>CENTRO</th>\n",
192
+ " <td>2004-01-01 00:00:00</td>\n",
193
+ " <td>2025-09-30 23:00:00</td>\n",
194
+ " </tr>\n",
195
+ " <tr>\n",
196
+ " <th>CONDADO</th>\n",
197
+ " <td>2004-01-01 00:00:00</td>\n",
198
+ " <td>2005-02-21 09:00:00</td>\n",
199
+ " </tr>\n",
200
+ " <tr>\n",
201
+ " <th>COTOCOLLAO</th>\n",
202
+ " <td>2005-02-25 14:00:00</td>\n",
203
+ " <td>2025-09-30 23:00:00</td>\n",
204
+ " </tr>\n",
205
+ " <tr>\n",
206
+ " <th>EL CAMAL</th>\n",
207
+ " <td>2004-01-01 00:00:00</td>\n",
208
+ " <td>2025-09-30 23:00:00</td>\n",
209
+ " </tr>\n",
210
+ " <tr>\n",
211
+ " <th>GUAMANI</th>\n",
212
+ " <td>2005-04-19 15:00:00</td>\n",
213
+ " <td>2025-06-18 08:00:00</td>\n",
214
+ " </tr>\n",
215
+ " <tr>\n",
216
+ " <th>LOS CHILLOS</th>\n",
217
+ " <td>2014-01-21 00:00:00</td>\n",
218
+ " <td>2025-09-30 12:00:00</td>\n",
219
+ " </tr>\n",
220
+ " <tr>\n",
221
+ " <th>TUMBACO</th>\n",
222
+ " <td>2019-06-10 17:00:00</td>\n",
223
+ " <td>2025-09-28 07:00:00</td>\n",
224
+ " </tr>\n",
225
+ " <tr>\n",
226
+ " <th>TURUBAMBA</th>\n",
227
+ " <td>2004-01-01 00:00:00</td>\n",
228
+ " <td>2005-03-08 09:00:00</td>\n",
229
+ " </tr>\n",
230
+ " </tbody>\n",
231
+ "</table>\n",
232
+ "</div>"
233
+ ],
234
+ "text/plain": [
235
+ " ds \n",
236
+ " min max\n",
237
+ "station \n",
238
+ "BELISARIO 2004-01-01 00:00:00 2025-09-30 23:00:00\n",
239
+ "CARAPUNGO 2005-03-16 15:00:00 2025-09-30 23:00:00\n",
240
+ "CENTRO 2004-01-01 00:00:00 2025-09-30 23:00:00\n",
241
+ "CONDADO 2004-01-01 00:00:00 2005-02-21 09:00:00\n",
242
+ "COTOCOLLAO 2005-02-25 14:00:00 2025-09-30 23:00:00\n",
243
+ "EL CAMAL 2004-01-01 00:00:00 2025-09-30 23:00:00\n",
244
+ "GUAMANI 2005-04-19 15:00:00 2025-06-18 08:00:00\n",
245
+ "LOS CHILLOS 2014-01-21 00:00:00 2025-09-30 12:00:00\n",
246
+ "TUMBACO 2019-06-10 17:00:00 2025-09-28 07:00:00\n",
247
+ "TURUBAMBA 2004-01-01 00:00:00 2005-03-08 09:00:00"
248
+ ]
249
+ },
250
+ "execution_count": 11,
251
+ "metadata": {},
252
+ "output_type": "execute_result"
253
+ }
254
+ ],
255
+ "source": [
256
+ "stats_lluvia"
257
+ ]
258
+ },
259
+ {
260
+ "cell_type": "code",
261
+ "execution_count": 12,
262
+ "metadata": {},
263
+ "outputs": [],
264
+ "source": [
265
+ "useful_places_lluvia = stats_lluvia[\n",
266
+ " stats_lluvia[(\"ds\", \"max\")] >= \"2025-09-28 07:00:00\"\n",
267
+ "].index"
268
+ ]
269
+ },
270
+ {
271
+ "cell_type": "code",
272
+ "execution_count": 13,
273
+ "metadata": {},
274
+ "outputs": [],
275
+ "source": [
276
+ "df_lluvia_melted = df_lluvia_melted[\n",
277
+ " df_lluvia_melted[\"station\"].isin(useful_places_lluvia)\n",
278
+ "]"
279
+ ]
280
+ },
281
+ {
282
+ "cell_type": "code",
283
+ "execution_count": 14,
284
+ "metadata": {},
285
+ "outputs": [],
286
+ "source": [
287
+ "df_lluvia_melted[\"y\"] = df_lluvia_melted[\"y\"].apply(\n",
288
+ " lambda x: np.nan if x == \" \" else x\n",
289
+ ")"
290
+ ]
291
+ },
292
+ {
293
+ "cell_type": "code",
294
+ "execution_count": 15,
295
+ "metadata": {},
296
+ "outputs": [],
297
+ "source": [
298
+ "df_lluvia_melted[\"property\"] = \"co\""
299
+ ]
300
+ },
301
+ {
302
+ "cell_type": "markdown",
303
+ "metadata": {},
304
+ "source": [
305
+ "## PM 2.5"
306
+ ]
307
+ },
308
+ {
309
+ "cell_type": "markdown",
310
+ "metadata": {},
311
+ "source": [
312
+ "Read the pm2.5 dataframe"
313
+ ]
314
+ },
315
+ {
316
+ "cell_type": "code",
317
+ "execution_count": 16,
318
+ "metadata": {},
319
+ "outputs": [],
320
+ "source": [
321
+ "df_pm = pd.read_excel(\n",
322
+ " \"/Users/sebastianalejandrosarastizambonino/Documents/conferences/aws_community_day_2025/data/ambiente_quito/PM2.5.xlsx\"\n",
323
+ ")"
324
+ ]
325
+ },
326
+ {
327
+ "cell_type": "code",
328
+ "execution_count": 17,
329
+ "metadata": {},
330
+ "outputs": [],
331
+ "source": [
332
+ "df_pm = df_pm[1:]"
333
+ ]
334
+ },
335
+ {
336
+ "cell_type": "code",
337
+ "execution_count": 18,
338
+ "metadata": {},
339
+ "outputs": [],
340
+ "source": [
341
+ "df_pm = df_pm.rename(columns={\"Unnamed: 0\": \"ds\"})"
342
+ ]
343
+ },
344
+ {
345
+ "cell_type": "code",
346
+ "execution_count": 19,
347
+ "metadata": {},
348
+ "outputs": [],
349
+ "source": [
350
+ "df_pm_melted = pd.melt(df_pm, id_vars=[\"ds\"], var_name=\"station\", value_name=\"y\")"
351
+ ]
352
+ },
353
+ {
354
+ "cell_type": "code",
355
+ "execution_count": 20,
356
+ "metadata": {},
357
+ "outputs": [],
358
+ "source": [
359
+ "df_pm_melted[\"y\"] = df_pm_melted[\"y\"].apply(\n",
360
+ " lambda x: np.nan if x == \" \" else x\n",
361
+ ")"
362
+ ]
363
+ },
364
+ {
365
+ "cell_type": "code",
366
+ "execution_count": 21,
367
+ "metadata": {},
368
+ "outputs": [],
369
+ "source": [
370
+ "df_pm_melted = df_pm_melted.dropna()"
371
+ ]
372
+ },
373
+ {
374
+ "cell_type": "code",
375
+ "execution_count": 22,
376
+ "metadata": {},
377
+ "outputs": [],
378
+ "source": [
379
+ "places_pm = df_pm_melted[\"station\"].unique()"
380
+ ]
381
+ },
382
+ {
383
+ "cell_type": "code",
384
+ "execution_count": 23,
385
+ "metadata": {},
386
+ "outputs": [
387
+ {
388
+ "name": "stdout",
389
+ "output_type": "stream",
390
+ "text": [
391
+ "['BELISARIO' 'CARAPUNGO' 'CENTRO' 'COTOCOLLAO' 'EL CAMAL' 'GUAMANI'\n",
392
+ " 'LOS CHILLOS' 'SAN ANTONIO' 'TUMBACO' 'TURUBAMBA']\n"
393
+ ]
394
+ }
395
+ ],
396
+ "source": [
397
+ "print(places_pm)"
398
+ ]
399
+ },
400
+ {
401
+ "cell_type": "code",
402
+ "execution_count": 24,
403
+ "metadata": {},
404
+ "outputs": [],
405
+ "source": [
406
+ "df_pm_melted[\"ds\"] = pd.to_datetime(df_pm_melted[\"ds\"])"
407
+ ]
408
+ },
409
+ {
410
+ "cell_type": "code",
411
+ "execution_count": 25,
412
+ "metadata": {},
413
+ "outputs": [],
414
+ "source": [
415
+ "metric_dates = df_pm_melted.groupby([\"station\"]).agg({\"ds\": [\"min\", \"max\"]})"
416
+ ]
417
+ },
418
+ {
419
+ "cell_type": "code",
420
+ "execution_count": 26,
421
+ "metadata": {},
422
+ "outputs": [
423
+ {
424
+ "data": {
425
+ "text/html": [
426
+ "<div>\n",
427
+ "<style scoped>\n",
428
+ " .dataframe tbody tr th:only-of-type {\n",
429
+ " vertical-align: middle;\n",
430
+ " }\n",
431
+ "\n",
432
+ " .dataframe tbody tr th {\n",
433
+ " vertical-align: top;\n",
434
+ " }\n",
435
+ "\n",
436
+ " .dataframe thead tr th {\n",
437
+ " text-align: left;\n",
438
+ " }\n",
439
+ "\n",
440
+ " .dataframe thead tr:last-of-type th {\n",
441
+ " text-align: right;\n",
442
+ " }\n",
443
+ "</style>\n",
444
+ "<table border=\"1\" class=\"dataframe\">\n",
445
+ " <thead>\n",
446
+ " <tr>\n",
447
+ " <th></th>\n",
448
+ " <th colspan=\"2\" halign=\"left\">ds</th>\n",
449
+ " </tr>\n",
450
+ " <tr>\n",
451
+ " <th></th>\n",
452
+ " <th>min</th>\n",
453
+ " <th>max</th>\n",
454
+ " </tr>\n",
455
+ " <tr>\n",
456
+ " <th>station</th>\n",
457
+ " <th></th>\n",
458
+ " <th></th>\n",
459
+ " </tr>\n",
460
+ " </thead>\n",
461
+ " <tbody>\n",
462
+ " <tr>\n",
463
+ " <th>BELISARIO</th>\n",
464
+ " <td>2004-09-03 17:00:00</td>\n",
465
+ " <td>2025-08-31 23:00:00</td>\n",
466
+ " </tr>\n",
467
+ " <tr>\n",
468
+ " <th>CARAPUNGO</th>\n",
469
+ " <td>2005-03-16 00:00:00</td>\n",
470
+ " <td>2025-08-31 23:00:00</td>\n",
471
+ " </tr>\n",
472
+ " <tr>\n",
473
+ " <th>CENTRO</th>\n",
474
+ " <td>2004-08-26 15:00:00</td>\n",
475
+ " <td>2025-08-31 23:00:00</td>\n",
476
+ " </tr>\n",
477
+ " <tr>\n",
478
+ " <th>COTOCOLLAO</th>\n",
479
+ " <td>2005-02-25 10:00:00</td>\n",
480
+ " <td>2025-08-31 23:00:00</td>\n",
481
+ " </tr>\n",
482
+ " <tr>\n",
483
+ " <th>EL CAMAL</th>\n",
484
+ " <td>2004-08-26 17:00:00</td>\n",
485
+ " <td>2025-08-31 23:00:00</td>\n",
486
+ " </tr>\n",
487
+ " <tr>\n",
488
+ " <th>GUAMANI</th>\n",
489
+ " <td>2013-10-28 00:00:00</td>\n",
490
+ " <td>2025-06-18 08:00:00</td>\n",
491
+ " </tr>\n",
492
+ " <tr>\n",
493
+ " <th>LOS CHILLOS</th>\n",
494
+ " <td>2014-01-21 00:00:00</td>\n",
495
+ " <td>2025-03-31 14:00:00</td>\n",
496
+ " </tr>\n",
497
+ " <tr>\n",
498
+ " <th>SAN ANTONIO</th>\n",
499
+ " <td>2017-03-29 00:00:00</td>\n",
500
+ " <td>2025-08-31 23:00:00</td>\n",
501
+ " </tr>\n",
502
+ " <tr>\n",
503
+ " <th>TUMBACO</th>\n",
504
+ " <td>2017-03-07 13:00:00</td>\n",
505
+ " <td>2025-08-31 23:00:00</td>\n",
506
+ " </tr>\n",
507
+ " <tr>\n",
508
+ " <th>TURUBAMBA</th>\n",
509
+ " <td>2004-10-04 17:00:00</td>\n",
510
+ " <td>2005-03-08 09:00:00</td>\n",
511
+ " </tr>\n",
512
+ " </tbody>\n",
513
+ "</table>\n",
514
+ "</div>"
515
+ ],
516
+ "text/plain": [
517
+ " ds \n",
518
+ " min max\n",
519
+ "station \n",
520
+ "BELISARIO 2004-09-03 17:00:00 2025-08-31 23:00:00\n",
521
+ "CARAPUNGO 2005-03-16 00:00:00 2025-08-31 23:00:00\n",
522
+ "CENTRO 2004-08-26 15:00:00 2025-08-31 23:00:00\n",
523
+ "COTOCOLLAO 2005-02-25 10:00:00 2025-08-31 23:00:00\n",
524
+ "EL CAMAL 2004-08-26 17:00:00 2025-08-31 23:00:00\n",
525
+ "GUAMANI 2013-10-28 00:00:00 2025-06-18 08:00:00\n",
526
+ "LOS CHILLOS 2014-01-21 00:00:00 2025-03-31 14:00:00\n",
527
+ "SAN ANTONIO 2017-03-29 00:00:00 2025-08-31 23:00:00\n",
528
+ "TUMBACO 2017-03-07 13:00:00 2025-08-31 23:00:00\n",
529
+ "TURUBAMBA 2004-10-04 17:00:00 2005-03-08 09:00:00"
530
+ ]
531
+ },
532
+ "execution_count": 26,
533
+ "metadata": {},
534
+ "output_type": "execute_result"
535
+ }
536
+ ],
537
+ "source": [
538
+ "metric_dates"
539
+ ]
540
+ },
541
+ {
542
+ "cell_type": "code",
543
+ "execution_count": 27,
544
+ "metadata": {},
545
+ "outputs": [],
546
+ "source": [
547
+ "useful_places_pm = metric_dates[\n",
548
+ " metric_dates[(\"ds\", \"max\")] == \"2025-08-31 23:00:00\"\n",
549
+ "].index"
550
+ ]
551
+ },
552
+ {
553
+ "cell_type": "code",
554
+ "execution_count": 28,
555
+ "metadata": {},
556
+ "outputs": [],
557
+ "source": [
558
+ "df_pm_melted = df_pm_melted[df_pm_melted[\"station\"].isin(useful_places_pm)]"
559
+ ]
560
+ },
561
+ {
562
+ "cell_type": "code",
563
+ "execution_count": 29,
564
+ "metadata": {},
565
+ "outputs": [],
566
+ "source": [
567
+ "df_pm_melted[\"property\"] = \"pm-2.5\""
568
+ ]
569
+ },
570
+ {
571
+ "cell_type": "markdown",
572
+ "metadata": {},
573
+ "source": [
574
+ "## Temperature"
575
+ ]
576
+ },
577
+ {
578
+ "cell_type": "code",
579
+ "execution_count": 30,
580
+ "metadata": {},
581
+ "outputs": [],
582
+ "source": [
583
+ "df_temp = pd.read_excel(\n",
584
+ " \"/Users/sebastianalejandrosarastizambonino/Documents/conferences/aws_community_day_2025/data/ambiente_quito/TMP.xlsx\"\n",
585
+ ")"
586
+ ]
587
+ },
588
+ {
589
+ "cell_type": "code",
590
+ "execution_count": 31,
591
+ "metadata": {},
592
+ "outputs": [],
593
+ "source": [
594
+ "df_temp = df_temp[1:]"
595
+ ]
596
+ },
597
+ {
598
+ "cell_type": "code",
599
+ "execution_count": 32,
600
+ "metadata": {},
601
+ "outputs": [],
602
+ "source": [
603
+ "df_temp = df_temp.rename(columns={\"Unnamed: 0\": \"ds\"})"
604
+ ]
605
+ },
606
+ {
607
+ "cell_type": "code",
608
+ "execution_count": 33,
609
+ "metadata": {},
610
+ "outputs": [],
611
+ "source": [
612
+ "df_temp_melted = pd.melt(df_temp, id_vars=[\"ds\"], var_name=\"station\", value_name=\"y\")"
613
+ ]
614
+ },
615
+ {
616
+ "cell_type": "code",
617
+ "execution_count": 34,
618
+ "metadata": {},
619
+ "outputs": [],
620
+ "source": [
621
+ "df_temp_melted = df_temp_melted.dropna()"
622
+ ]
623
+ },
624
+ {
625
+ "cell_type": "code",
626
+ "execution_count": 35,
627
+ "metadata": {},
628
+ "outputs": [],
629
+ "source": [
630
+ "df_temp_melted[\"ds\"] = pd.to_datetime(df_temp_melted[\"ds\"])"
631
+ ]
632
+ },
633
+ {
634
+ "cell_type": "code",
635
+ "execution_count": 36,
636
+ "metadata": {},
637
+ "outputs": [],
638
+ "source": [
639
+ "metrics_temp = df_temp_melted.groupby([\"station\"]).agg({\"ds\": [\"min\", \"max\"]})"
640
+ ]
641
+ },
642
+ {
643
+ "cell_type": "code",
644
+ "execution_count": 37,
645
+ "metadata": {},
646
+ "outputs": [],
647
+ "source": [
648
+ "useful_places_temp = metrics_temp[\n",
649
+ " metrics_temp[(\"ds\", \"max\")] == pd.to_datetime(\"2025-09-30 23:00:00\")\n",
650
+ "].index"
651
+ ]
652
+ },
653
+ {
654
+ "cell_type": "code",
655
+ "execution_count": 38,
656
+ "metadata": {},
657
+ "outputs": [],
658
+ "source": [
659
+ "df_temp_melted = df_temp_melted[df_temp_melted[\"station\"].isin(useful_places_temp)]"
660
+ ]
661
+ },
662
+ {
663
+ "cell_type": "code",
664
+ "execution_count": 39,
665
+ "metadata": {},
666
+ "outputs": [],
667
+ "source": [
668
+ "df_temp_melted[\"property\"] = \"temperature\""
669
+ ]
670
+ },
671
+ {
672
+ "cell_type": "markdown",
673
+ "metadata": {},
674
+ "source": [
675
+ "Concat to have a single dataframe"
676
+ ]
677
+ },
678
+ {
679
+ "cell_type": "code",
680
+ "execution_count": 40,
681
+ "metadata": {},
682
+ "outputs": [],
683
+ "source": [
684
+ "df_final = pd.concat([df_lluvia_melted, df_pm_melted, df_temp_melted])"
685
+ ]
686
+ },
687
+ {
688
+ "cell_type": "code",
689
+ "execution_count": 41,
690
+ "metadata": {},
691
+ "outputs": [],
692
+ "source": [
693
+ "df_final[\"station\"] = df_final[\"station\"].apply(lambda x: x.upper())"
694
+ ]
695
+ },
696
+ {
697
+ "cell_type": "code",
698
+ "execution_count": 42,
699
+ "metadata": {},
700
+ "outputs": [],
701
+ "source": [
702
+ "df_final = df_final[df_final[\"ds\"] <= pd.to_datetime(\"2025-08-31 23:00:00\")]"
703
+ ]
704
+ },
705
+ {
706
+ "cell_type": "code",
707
+ "execution_count": 43,
708
+ "metadata": {},
709
+ "outputs": [],
710
+ "source": [
711
+ "df_final[\"station\"] = df_final[\"station\"].apply(\n",
712
+ " lambda x: \"SAN ANTONIO\" if x == \"SANANTONIO\" else x\n",
713
+ ")\n",
714
+ "\n",
715
+ "df_final[\"station\"] = df_final[\"station\"].apply(\n",
716
+ " lambda x: \"EL CAMAL\" if x == \"ELCAMAL\" else x\n",
717
+ ")"
718
+ ]
719
+ },
720
+ {
721
+ "cell_type": "code",
722
+ "execution_count": 44,
723
+ "metadata": {},
724
+ "outputs": [
725
+ {
726
+ "data": {
727
+ "text/plain": [
728
+ "array(['BELISARIO', 'CARAPUNGO', 'CENTRO', 'COTOCOLLAO', 'EL CAMAL',\n",
729
+ " 'LOS CHILLOS', 'TUMBACO', 'SAN ANTONIO'], dtype=object)"
730
+ ]
731
+ },
732
+ "execution_count": 44,
733
+ "metadata": {},
734
+ "output_type": "execute_result"
735
+ }
736
+ ],
737
+ "source": [
738
+ "df_final[\"station\"].unique()"
739
+ ]
740
+ },
741
+ {
742
+ "cell_type": "code",
743
+ "execution_count": 45,
744
+ "metadata": {},
745
+ "outputs": [],
746
+ "source": [
747
+ "FINAL_PATH = \"/Users/sebastianalejandrosarastizambonino/Documents/conferences/aws_community_day_2025/data\"\n",
748
+ "df_final.to_parquet(f\"{FINAL_PATH}/datos_ambiente_quito.parquet\")"
749
+ ]
750
+ }
751
+ ],
752
+ "metadata": {
753
+ "kernelspec": {
754
+ "display_name": "aws_conf",
755
+ "language": "python",
756
+ "name": "python3"
757
+ },
758
+ "language_info": {
759
+ "codemirror_mode": {
760
+ "name": "ipython",
761
+ "version": 3
762
+ },
763
+ "file_extension": ".py",
764
+ "mimetype": "text/x-python",
765
+ "name": "python",
766
+ "nbconvert_exporter": "python",
767
+ "pygments_lexer": "ipython3",
768
+ "version": "3.11.13"
769
+ }
770
+ },
771
+ "nbformat": 4,
772
+ "nbformat_minor": 2
773
+ }
notebooks/03_data_verification.ipynb ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import pandas as pd"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 2,
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "df = pd.read_parquet(\n",
19
+ " \"/Users/sebastianalejandrosarastizambonino/Documents/conferences/aws_community_day_2025/data/datos_ambiente_quito.parquet\"\n",
20
+ ")"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": 4,
26
+ "metadata": {},
27
+ "outputs": [
28
+ {
29
+ "data": {
30
+ "text/html": [
31
+ "<div>\n",
32
+ "<style scoped>\n",
33
+ " .dataframe tbody tr th:only-of-type {\n",
34
+ " vertical-align: middle;\n",
35
+ " }\n",
36
+ "\n",
37
+ " .dataframe tbody tr th {\n",
38
+ " vertical-align: top;\n",
39
+ " }\n",
40
+ "\n",
41
+ " .dataframe thead th {\n",
42
+ " text-align: right;\n",
43
+ " }\n",
44
+ "</style>\n",
45
+ "<table border=\"1\" class=\"dataframe\">\n",
46
+ " <thead>\n",
47
+ " <tr style=\"text-align: right;\">\n",
48
+ " <th></th>\n",
49
+ " <th>ds</th>\n",
50
+ " <th>station</th>\n",
51
+ " <th>y</th>\n",
52
+ " <th>property</th>\n",
53
+ " </tr>\n",
54
+ " </thead>\n",
55
+ " <tbody>\n",
56
+ " <tr>\n",
57
+ " <th>0</th>\n",
58
+ " <td>2004-01-01 00:00:00</td>\n",
59
+ " <td>BELISARIO</td>\n",
60
+ " <td>7.42</td>\n",
61
+ " <td>co</td>\n",
62
+ " </tr>\n",
63
+ " <tr>\n",
64
+ " <th>1</th>\n",
65
+ " <td>2004-01-01 01:00:00</td>\n",
66
+ " <td>BELISARIO</td>\n",
67
+ " <td>7.96</td>\n",
68
+ " <td>co</td>\n",
69
+ " </tr>\n",
70
+ " <tr>\n",
71
+ " <th>2</th>\n",
72
+ " <td>2004-01-01 02:00:00</td>\n",
73
+ " <td>BELISARIO</td>\n",
74
+ " <td>8.42</td>\n",
75
+ " <td>co</td>\n",
76
+ " </tr>\n",
77
+ " <tr>\n",
78
+ " <th>3</th>\n",
79
+ " <td>2004-01-01 03:00:00</td>\n",
80
+ " <td>BELISARIO</td>\n",
81
+ " <td>9.06</td>\n",
82
+ " <td>co</td>\n",
83
+ " </tr>\n",
84
+ " <tr>\n",
85
+ " <th>4</th>\n",
86
+ " <td>2004-01-01 04:00:00</td>\n",
87
+ " <td>BELISARIO</td>\n",
88
+ " <td>6.57</td>\n",
89
+ " <td>co</td>\n",
90
+ " </tr>\n",
91
+ " <tr>\n",
92
+ " <th>...</th>\n",
93
+ " <td>...</td>\n",
94
+ " <td>...</td>\n",
95
+ " <td>...</td>\n",
96
+ " <td>...</td>\n",
97
+ " </tr>\n",
98
+ " <tr>\n",
99
+ " <th>1524523</th>\n",
100
+ " <td>2025-08-31 19:00:00</td>\n",
101
+ " <td>SAN ANTONIO</td>\n",
102
+ " <td>12.78</td>\n",
103
+ " <td>Temperature</td>\n",
104
+ " </tr>\n",
105
+ " <tr>\n",
106
+ " <th>1524524</th>\n",
107
+ " <td>2025-08-31 20:00:00</td>\n",
108
+ " <td>SAN ANTONIO</td>\n",
109
+ " <td>12.28</td>\n",
110
+ " <td>Temperature</td>\n",
111
+ " </tr>\n",
112
+ " <tr>\n",
113
+ " <th>1524525</th>\n",
114
+ " <td>2025-08-31 21:00:00</td>\n",
115
+ " <td>SAN ANTONIO</td>\n",
116
+ " <td>12.21</td>\n",
117
+ " <td>Temperature</td>\n",
118
+ " </tr>\n",
119
+ " <tr>\n",
120
+ " <th>1524526</th>\n",
121
+ " <td>2025-08-31 22:00:00</td>\n",
122
+ " <td>SAN ANTONIO</td>\n",
123
+ " <td>12.35</td>\n",
124
+ " <td>Temperature</td>\n",
125
+ " </tr>\n",
126
+ " <tr>\n",
127
+ " <th>1524527</th>\n",
128
+ " <td>2025-08-31 23:00:00</td>\n",
129
+ " <td>SAN ANTONIO</td>\n",
130
+ " <td>12.22</td>\n",
131
+ " <td>Temperature</td>\n",
132
+ " </tr>\n",
133
+ " </tbody>\n",
134
+ "</table>\n",
135
+ "<p>2828207 rows × 4 columns</p>\n",
136
+ "</div>"
137
+ ],
138
+ "text/plain": [
139
+ " ds station y property\n",
140
+ "0 2004-01-01 00:00:00 BELISARIO 7.42 co\n",
141
+ "1 2004-01-01 01:00:00 BELISARIO 7.96 co\n",
142
+ "2 2004-01-01 02:00:00 BELISARIO 8.42 co\n",
143
+ "3 2004-01-01 03:00:00 BELISARIO 9.06 co\n",
144
+ "4 2004-01-01 04:00:00 BELISARIO 6.57 co\n",
145
+ "... ... ... ... ...\n",
146
+ "1524523 2025-08-31 19:00:00 SAN ANTONIO 12.78 Temperature\n",
147
+ "1524524 2025-08-31 20:00:00 SAN ANTONIO 12.28 Temperature\n",
148
+ "1524525 2025-08-31 21:00:00 SAN ANTONIO 12.21 Temperature\n",
149
+ "1524526 2025-08-31 22:00:00 SAN ANTONIO 12.35 Temperature\n",
150
+ "1524527 2025-08-31 23:00:00 SAN ANTONIO 12.22 Temperature\n",
151
+ "\n",
152
+ "[2828207 rows x 4 columns]"
153
+ ]
154
+ },
155
+ "execution_count": 4,
156
+ "metadata": {},
157
+ "output_type": "execute_result"
158
+ }
159
+ ],
160
+ "source": [
161
+ "df"
162
+ ]
163
+ },
164
+ {
165
+ "cell_type": "code",
166
+ "execution_count": null,
167
+ "metadata": {},
168
+ "outputs": [],
169
+ "source": []
170
+ }
171
+ ],
172
+ "metadata": {
173
+ "kernelspec": {
174
+ "display_name": "aws_conf",
175
+ "language": "python",
176
+ "name": "python3"
177
+ },
178
+ "language_info": {
179
+ "codemirror_mode": {
180
+ "name": "ipython",
181
+ "version": 3
182
+ },
183
+ "file_extension": ".py",
184
+ "mimetype": "text/x-python",
185
+ "name": "python",
186
+ "nbconvert_exporter": "python",
187
+ "pygments_lexer": "ipython3",
188
+ "version": "3.11.13"
189
+ }
190
+ },
191
+ "nbformat": 4,
192
+ "nbformat_minor": 2
193
+ }
src/chronos_conference/adapters/filter_ts.py CHANGED
@@ -2,9 +2,26 @@ import pandas as pd
2
 
3
 
4
  def filter_ts(
5
- df: pd.DataFrame, date_col: str, min_date: str, max_date: str
 
 
 
 
 
 
 
6
  ) -> pd.DataFrame:
7
  df = df.copy()
8
  if pd.api.types.is_datetime64_any_dtype(df[date_col]) is False:
9
  df[date_col] = pd.to_datetime(df[date_col])
10
- return df[(df[date_col] >= min_date) & (df[date_col] <= max_date)]
 
 
 
 
 
 
 
 
 
 
 
2
 
3
 
def _as_list(choice) -> list:
    """Return *choice* as a list, wrapping a scalar (including a str) in one."""
    # pandas treats strings as scalars here, so "CENTRO" becomes ["CENTRO"].
    if pd.api.types.is_list_like(choice):
        return list(choice)
    return [choice]


def filter_ts(
    df: pd.DataFrame,
    date_col: str,
    min_date: str,
    max_date: str,
    city_col: str,
    city_choice: "str | list",
    property_col: str,
    property_choice: "str | list",
) -> pd.DataFrame:
    """Filter a long-format frame by date window, zone and property.

    Args:
        df: Input frame; it is copied, so the caller's object is not modified.
        date_col: Name of the timestamp column. It is converted with
            ``pd.to_datetime`` when it is not already a datetime dtype.
        min_date: Inclusive lower bound of the date window.
        max_date: Inclusive upper bound of the date window.
        city_col: Name of the column holding the zone/station label.
        city_choice: A single label or a list of labels to keep.
        property_col: Name of the column holding the measured property.
        property_choice: A single property or a list of properties to keep.

    Returns:
        The rows of the copied frame that satisfy all three filters, with the
        original index preserved.
    """
    df = df.copy()
    if not pd.api.types.is_datetime64_any_dtype(df[date_col]):
        df[date_col] = pd.to_datetime(df[date_col])

    # ``isin`` works for one label or many; the previous ``==`` comparison
    # only behaved correctly when ``city_choice`` was a single value.
    keep = (
        (df[date_col] >= min_date)
        & (df[date_col] <= max_date)
        & df[city_col].isin(_as_list(city_choice))
        & df[property_col].isin(_as_list(property_choice))
    )
    return df[keep]
24
+
25
+
def get_properties(
    df: pd.DataFrame,
    city_col: str,
    property_given: str,
    property_col: str = "property",
) -> list:
    """Return the distinct properties recorded for one zone.

    Args:
        df: Long-format frame containing ``city_col`` and ``property_col``.
        city_col: Name of the column holding the zone/station label.
        property_given: Despite its name, the zone value matched against
            ``city_col``; the name is kept so existing callers keep working.
        property_col: Column whose distinct values are returned. Defaults to
            ``"property"``, the name that was previously hard-coded.

    Returns:
        Distinct non-null values of ``property_col`` for the matching rows, in
        order of first appearance; an empty list when no row matches.
    """
    zone_rows = df[df[city_col] == property_given]
    # Missing labels are dropped so they never show up as a selectable option.
    return zone_rows[property_col].dropna().unique().tolist()
src/chronos_conference/adapters/model_instance.py CHANGED
@@ -45,7 +45,6 @@ class ChronosForecaster(ForecastingBaseModel):
45
  results = results.rename(
46
  columns={
47
  "mean": "AWSChronosForecast",
48
- "item_id": "unique_id",
49
  "timestamp": "ds",
50
  }
51
  )
 
45
  results = results.rename(
46
  columns={
47
  "mean": "AWSChronosForecast",
 
48
  "timestamp": "ds",
49
  }
50
  )
src/chronos_conference/adapters/ts_plot.py CHANGED
@@ -1,16 +1,77 @@
1
- import plotly.express as px
 
 
 
2
  import pandas as pd
 
3
 
4
 
5
  def get_plot(df_story: pd.DataFrame, df_pred: pd.DataFrame):
6
- if pd.api.types.is_datetime64_any_dtype(df_story["datetime"]) is False:
7
- df_story["datetime"] = pd.to_datetime(df_story["datetime"])
 
 
 
 
 
 
 
 
 
 
8
 
9
- if pd.api.types.is_datetime64_any_dtype(df_pred["ds"]) is False:
10
  df_pred["ds"] = pd.to_datetime(df_pred["ds"])
11
 
12
- fig = px.line(df_story, x="datetime", y="value", title="Historical Information")
13
- fig.add_scatter(
14
- x=df_pred["ds"], y=df_pred["AWSChronosForecast"], mode="lines", name="Forecast"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  )
 
16
  return fig
 
1
+ # ruff: noqa: F403, F405
2
+
3
+ import plotly.graph_objects as go
4
+ from plotly.subplots import make_subplots
5
  import pandas as pd
6
+ from chronos_conference.settings import *
7
 
8
 
def _setting_for(mapping: dict, item, default: str = "N/A") -> str:
    """Look up *item* in a settings mapping, ignoring case, else *default*."""
    if item in mapping:
        return mapping[item]
    # Data labels and settings keys may differ only in case
    # (e.g. "Temperature" vs "temperature").
    lowered = {str(key).lower(): val for key, val in mapping.items()}
    return lowered.get(str(item).lower(), default)


def get_plot(df_story: pd.DataFrame, df_pred: pd.DataFrame):
    """Build one subplot per item with its history and its forecast.

    Args:
        df_story: Historical data containing the columns named by
            ``HISTORICAL_DATE_COLUMN``, ``HISTORICAL_TARGET_COLUMN`` and
            ``HISTORICAL_ITEM_COLUMN``.
        df_pred: Forecast data with ``ds``, ``item_id`` and
            ``AWSChronosForecast`` columns.

    Returns:
        A plotly figure with a single row and one column per distinct item.
        Each subplot title shows the unit and safety limit taken from
        ``UNITS_MEASURED`` / ``MAX_SAFETY_LIMITS`` ("N/A" when the item has no
        entry). When there are no items, an empty titled figure is returned.
    """
    # ``rename`` returns a new frame, so the conversion below never touches
    # the caller's object.
    df_story = df_story.rename(
        columns={
            HISTORICAL_DATE_COLUMN: "datetime",
            HISTORICAL_TARGET_COLUMN: "value",
            HISTORICAL_ITEM_COLUMN: "item_id",
        }
    )
    if not pd.api.types.is_datetime64_any_dtype(df_story["datetime"]):
        df_story["datetime"] = pd.to_datetime(df_story["datetime"])

    if not pd.api.types.is_datetime64_any_dtype(df_pred["ds"]):
        # ``assign`` builds a new frame instead of writing into the argument.
        df_pred = df_pred.assign(ds=pd.to_datetime(df_pred["ds"]))

    item_ids = df_story["item_id"].unique()
    number_columns = len(item_ids)

    if number_columns == 0:
        # make_subplots rejects cols=0; return an empty figure instead.
        empty_fig = go.Figure()
        empty_fig.update_layout(title_text="Serie histórica y predicción por ítem")
        return empty_fig

    subplot_titles = [
        f"{item}<br><span style='font-size:10px;color:gray;'>Unidad: {_setting_for(UNITS_MEASURED, item)} | "
        f"Límite máx: {_setting_for(MAX_SAFETY_LIMITS, item)}</span>"
        for item in item_ids
    ]

    fig = make_subplots(rows=1, cols=number_columns, subplot_titles=subplot_titles)

    for idx, value in enumerate(item_ids):
        df_story_subset = df_story[df_story["item_id"] == value]
        df_pred_subset = df_pred[df_pred["item_id"] == value]

        # Only the first subplot contributes legend entries, to avoid repeats.
        show_legend = idx == 0

        fig.add_trace(
            go.Scatter(
                x=df_story_subset["datetime"],
                y=df_story_subset["value"],
                mode="lines",
                name="Histórico",
                line=dict(color="blue", width=2),
                showlegend=show_legend,
            ),
            row=1,
            col=idx + 1,
        )

        fig.add_trace(
            go.Scatter(
                x=df_pred_subset["ds"],
                y=df_pred_subset["AWSChronosForecast"],
                mode="lines+markers",
                name="Predicción",
                line=dict(color="orange", dash="dash"),
                marker=dict(symbol="x", size=8, color="orange"),
                showlegend=show_legend,
            ),
            row=1,
            col=idx + 1,
        )

    fig.update_layout(
        showlegend=True,
        height=400,
        width=300 * number_columns,
        title_text="Serie histórica y predicción por ítem",
    )

    return fig
src/chronos_conference/service_layer/main.py CHANGED
@@ -4,11 +4,12 @@ import streamlit as st
4
  import pandas as pd
5
 
6
  from chronos_conference.domain.inference import get_forecast
7
- from chronos_conference.adapters.filter_ts import filter_ts
8
  from chronos_conference.adapters.model_instance import ChronosForecaster
9
  from chronos_conference.adapters.ts_plot import get_plot
10
  from chronos_conference.settings import *
11
 
 
12
  st.title("AWS Community Day Ecuador 2025")
13
  st.header(
14
  "Conferencia: Aprendiendo el Lenguaje de las series de tiempo con AWS Chronos Bolt"
@@ -22,9 +23,9 @@ datos abiertos obtenidos del INAMHI.
22
  """
23
  )
24
 
25
- df = pd.read_csv(PATH_DATA)
26
 
27
- col1, col2, col3 = st.columns(3)
28
 
29
  with col1:
30
  min_date = st.date_input("Fecha mínima", value=MIN_PRED_DATE)
@@ -32,7 +33,29 @@ with col1:
32
  with col2:
33
  max_date = st.date_input("Fecha máxima", value=MAX_PRED_DATE)
34
 
35
- with col3:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  n_steps = st.number_input(
37
  "Número de pasos a predecir",
38
  min_value=MIN_PRED_DATE_LIMIT,
@@ -40,6 +63,9 @@ with col3:
40
  value=N_PRED_STEPS,
41
  )
42
 
 
 
 
43
  execution_button = st.button("Ejecutar modelo")
44
 
45
  if not execution_button:
@@ -51,6 +77,10 @@ with st.spinner("Filtrando datos..."):
51
  date_col=HISTORICAL_DATE_COLUMN,
52
  min_date=str(min_date),
53
  max_date=str(max_date),
 
 
 
 
54
  )
55
 
56
  model = ChronosForecaster(freq=FREQUENCY)
 
4
  import pandas as pd
5
 
6
  from chronos_conference.domain.inference import get_forecast
7
+ from chronos_conference.adapters.filter_ts import filter_ts, get_properties
8
  from chronos_conference.adapters.model_instance import ChronosForecaster
9
  from chronos_conference.adapters.ts_plot import get_plot
10
  from chronos_conference.settings import *
11
 
12
+
13
  st.title("AWS Community Day Ecuador 2025")
14
  st.header(
15
  "Conferencia: Aprendiendo el Lenguaje de las series de tiempo con AWS Chronos Bolt"
 
23
  """
24
  )
25
 
26
+ df = pd.read_parquet(PATH_DATA)
27
 
28
+ col1, col2 = st.columns(2)
29
 
30
  with col1:
31
  min_date = st.date_input("Fecha mínima", value=MIN_PRED_DATE)
 
33
  with col2:
34
  max_date = st.date_input("Fecha máxima", value=MAX_PRED_DATE)
35
 
36
+
37
+ city_choice = st.selectbox(
38
+ "Seleccione la zona de la ciudad a predecir", ZONES_TO_PREDICT
39
+ )
40
+
41
+ if not city_choice:
42
+ st.stop()
43
+
44
+ available_properties = get_properties(df, ZONE_COL, city_choice)
45
+
46
+ if not available_properties:
47
+ st.stop()
48
+
49
+ col1, col2 = st.columns(2)
50
+
51
+ with col1:
52
+ property_choice = st.pills(
53
+ "Seleccione la propiedad a predecir",
54
+ available_properties,
55
+ selection_mode="multi",
56
+ )
57
+
58
+ with col2:
59
  n_steps = st.number_input(
60
  "Número de pasos a predecir",
61
  min_value=MIN_PRED_DATE_LIMIT,
 
63
  value=N_PRED_STEPS,
64
  )
65
 
66
+ if not property_choice:
67
+ st.stop()
68
+
69
  execution_button = st.button("Ejecutar modelo")
70
 
71
  if not execution_button:
 
77
  date_col=HISTORICAL_DATE_COLUMN,
78
  min_date=str(min_date),
79
  max_date=str(max_date),
80
+ city_col=ZONE_COL,
81
+ city_choice=city_choice,
82
+ property_col=HISTORICAL_ITEM_COLUMN,
83
+ property_choice=property_choice,
84
  )
85
 
86
  model = ChronosForecaster(freq=FREQUENCY)
src/chronos_conference/settings.py CHANGED
@@ -1,13 +1,35 @@
1
- HISTORICAL_DATE_COLUMN = "datetime"
2
- HISTORICAL_ITEM_COLUMN = "unique_id"
3
- HISTORICAL_TARGET_COLUMN = "value"
 
4
 
5
- PATH_DATA = "data/historical_simulation_9023624.csv"
 
6
 
7
- FREQUENCY = "D"
8
 
9
- MIN_PRED_DATE = "2022-01-01"
10
- MAX_PRED_DATE = "2023-06-30"
11
  N_PRED_STEPS = 48
12
  MIN_PRED_DATE_LIMIT = 1
13
  MAX_PRED_DATE_LIMIT = 128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Column names of the long-format historical dataset.
HISTORICAL_DATE_COLUMN = "ds"
HISTORICAL_ITEM_COLUMN = "property"
ZONE_COL = "station"
HISTORICAL_TARGET_COLUMN = "y"

# Parquet file with the Quito measurements, relative to the working directory.
PATH_DATA = "data/datos_ambiente_quito.parquet"

# Frequency string handed to ChronosForecaster.
# NOTE(review): "h" looks like the pandas hourly alias - confirm.
FREQUENCY = "h"

# Default values for the date inputs and the forecast-horizon input in the UI.
MIN_PRED_DATE = "2025-08-29 23:00:00"
MAX_PRED_DATE = "2025-08-31 23:00:00"
N_PRED_STEPS = 48
MIN_PRED_DATE_LIMIT = 1
MAX_PRED_DATE_LIMIT = 128

# Property labels must match the values stored in the "property" column; the
# verified dataset spells the temperature series "Temperature".
PROPERTIES_TO_PREDICT = ["co", "pm-2.5", "Temperature"]
ZONES_TO_PREDICT = [
    "BELISARIO",
    "CARAPUNGO",
    "CENTRO",
    "COTOCOLLAO",
    "EL CAMAL",
    "LOS CHILLOS",
    "TUMBACO",
    "SAN ANTONIO",
]

# Unit shown in each subplot title, keyed by property label.
UNITS_MEASURED = {"co": "mg/m3", "pm-2.5": "µg/m3", "Temperature": "°C"}

# Safety limit shown in each subplot title, keyed by property label.
MAX_SAFETY_LIMITS = {
    "pm-2.5": "35 µg/m3 daily",  # reference: https://ww2.arb.ca.gov/es/resources/inhalable-particulate-matter-and-health
    "co": "23 mg/m3",  # reference: https://www.healthcouncil.nl/documents/2024/09/09/carbon-monoxide
    "Temperature": "N/A",
}