{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "df_lluvia = pd.read_excel(\n", " \"/Users/sebastianalejandrosarastizambonino/Documents/conferences/aws_community_day_2025/data/ambiente_quito/CO.xlsx\",\n", " skiprows=0,\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Select useful raws" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "df_lluvia = df_lluvia[1:]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Rename the dates" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "df_lluvia = df_lluvia.rename(columns={\"Unnamed: 0\": \"ds\"})" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "df_lluvia[\"ds\"] = pd.to_datetime(df_lluvia[\"ds\"])" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "df_lluvia_melted = pd.melt(\n", " df_lluvia, id_vars=[\"ds\"], var_name=\"station\", value_name=\"y\"\n", ")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "df_lluvia_melted = df_lluvia_melted.dropna()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Seleccionar los lugares disponibles para la lluvia" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "places_lluvia = df_lluvia_melted[\"station\"].unique()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['BELISARIO' 'CARAPUNGO' 'CENTRO' 'COTOCOLLAO' 'EL CAMAL' 'GUAMANI'\n", " 'LOS CHILLOS' 'TUMBACO' 'CONDADO' 'TURUBAMBA']\n" ] } ], "source": [ "print(places_lluvia)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "See the min and max dates for the lluvia places" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "stats_lluvia = df_lluvia_melted.groupby(\"station\").agg({\"ds\": [\"min\", \"max\"]})" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ds
minmax
station
BELISARIO2004-01-01 00:00:002025-09-30 23:00:00
CARAPUNGO2005-03-16 15:00:002025-09-30 23:00:00
CENTRO2004-01-01 00:00:002025-09-30 23:00:00
CONDADO2004-01-01 00:00:002005-02-21 09:00:00
COTOCOLLAO2005-02-25 14:00:002025-09-30 23:00:00
EL CAMAL2004-01-01 00:00:002025-09-30 23:00:00
GUAMANI2005-04-19 15:00:002025-06-18 08:00:00
LOS CHILLOS2014-01-21 00:00:002025-09-30 12:00:00
TUMBACO2019-06-10 17:00:002025-09-28 07:00:00
TURUBAMBA2004-01-01 00:00:002005-03-08 09:00:00
\n", "
" ], "text/plain": [ " ds \n", " min max\n", "station \n", "BELISARIO 2004-01-01 00:00:00 2025-09-30 23:00:00\n", "CARAPUNGO 2005-03-16 15:00:00 2025-09-30 23:00:00\n", "CENTRO 2004-01-01 00:00:00 2025-09-30 23:00:00\n", "CONDADO 2004-01-01 00:00:00 2005-02-21 09:00:00\n", "COTOCOLLAO 2005-02-25 14:00:00 2025-09-30 23:00:00\n", "EL CAMAL 2004-01-01 00:00:00 2025-09-30 23:00:00\n", "GUAMANI 2005-04-19 15:00:00 2025-06-18 08:00:00\n", "LOS CHILLOS 2014-01-21 00:00:00 2025-09-30 12:00:00\n", "TUMBACO 2019-06-10 17:00:00 2025-09-28 07:00:00\n", "TURUBAMBA 2004-01-01 00:00:00 2005-03-08 09:00:00" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stats_lluvia" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "useful_places_lluvia = stats_lluvia[\n", " stats_lluvia[(\"ds\", \"max\")] >= \"2025-09-28 07:00:00\"\n", "].index" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "df_lluvia_melted = df_lluvia_melted[\n", " df_lluvia_melted[\"station\"].isin(useful_places_lluvia)\n", "]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "df_lluvia_melted[\"y\"] = df_lluvia_melted[\"y\"].apply(\n", " lambda x: np.nan if x == \" \" else x\n", ")" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "df_lluvia_melted[\"property\"] = \"co\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## PM 2.5" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Read the pm2.5 dataframe" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "df_pm = pd.read_excel(\n", " \"/Users/sebastianalejandrosarastizambonino/Documents/conferences/aws_community_day_2025/data/ambiente_quito/PM2.5.xlsx\"\n", ")" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "df_pm = df_pm[1:]" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "df_pm = df_pm.rename(columns={\"Unnamed: 0\": \"ds\"})" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "df_pm_melted = pd.melt(df_pm, id_vars=[\"ds\"], var_name=\"station\", value_name=\"y\")" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "df_pm_melted[\"y\"] = df_pm_melted[\"y\"].apply(\n", " lambda x: np.nan if x == \" \" else x\n", ")" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "df_pm_melted = df_pm_melted.dropna()" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "places_pm = df_pm_melted[\"station\"].unique()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['BELISARIO' 'CARAPUNGO' 'CENTRO' 'COTOCOLLAO' 'EL CAMAL' 'GUAMANI'\n", " 'LOS CHILLOS' 'SAN ANTONIO' 'TUMBACO' 'TURUBAMBA']\n" ] } ], "source": [ "print(places_pm)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "df_pm_melted[\"ds\"] = pd.to_datetime(df_pm_melted[\"ds\"])" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "metric_dates = df_pm_melted.groupby([\"station\"]).agg({\"ds\": [\"min\", \"max\"]})" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ds
minmax
station
BELISARIO2004-09-03 17:00:002025-08-31 23:00:00
CARAPUNGO2005-03-16 00:00:002025-08-31 23:00:00
CENTRO2004-08-26 15:00:002025-08-31 23:00:00
COTOCOLLAO2005-02-25 10:00:002025-08-31 23:00:00
EL CAMAL2004-08-26 17:00:002025-08-31 23:00:00
GUAMANI2013-10-28 00:00:002025-06-18 08:00:00
LOS CHILLOS2014-01-21 00:00:002025-03-31 14:00:00
SAN ANTONIO2017-03-29 00:00:002025-08-31 23:00:00
TUMBACO2017-03-07 13:00:002025-08-31 23:00:00
TURUBAMBA2004-10-04 17:00:002005-03-08 09:00:00
\n", "
" ], "text/plain": [ " ds \n", " min max\n", "station \n", "BELISARIO 2004-09-03 17:00:00 2025-08-31 23:00:00\n", "CARAPUNGO 2005-03-16 00:00:00 2025-08-31 23:00:00\n", "CENTRO 2004-08-26 15:00:00 2025-08-31 23:00:00\n", "COTOCOLLAO 2005-02-25 10:00:00 2025-08-31 23:00:00\n", "EL CAMAL 2004-08-26 17:00:00 2025-08-31 23:00:00\n", "GUAMANI 2013-10-28 00:00:00 2025-06-18 08:00:00\n", "LOS CHILLOS 2014-01-21 00:00:00 2025-03-31 14:00:00\n", "SAN ANTONIO 2017-03-29 00:00:00 2025-08-31 23:00:00\n", "TUMBACO 2017-03-07 13:00:00 2025-08-31 23:00:00\n", "TURUBAMBA 2004-10-04 17:00:00 2005-03-08 09:00:00" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "metric_dates" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "useful_places_pm = metric_dates[\n", " metric_dates[(\"ds\", \"max\")] == \"2025-08-31 23:00:00\"\n", "].index" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "df_pm_melted = df_pm_melted[df_pm_melted[\"station\"].isin(useful_places_pm)]" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "df_pm_melted[\"property\"] = \"pm-2.5\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Temperature" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "df_temp = pd.read_excel(\n", " \"/Users/sebastianalejandrosarastizambonino/Documents/conferences/aws_community_day_2025/data/ambiente_quito/TMP.xlsx\"\n", ")" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "df_temp = df_temp[1:]" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "df_temp = df_temp.rename(columns={\"Unnamed: 0\": \"ds\"})" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "df_temp_melted = pd.melt(df_temp, id_vars=[\"ds\"], var_name=\"station\", value_name=\"y\")" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "df_temp_melted = df_temp_melted.dropna()" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "df_temp_melted[\"ds\"] = pd.to_datetime(df_temp_melted[\"ds\"])" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "metrics_temp = df_temp_melted.groupby([\"station\"]).agg({\"ds\": [\"min\", \"max\"]})" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "useful_places_temp = metrics_temp[\n", " metrics_temp[(\"ds\", \"max\")] == pd.to_datetime(\"2025-09-30 23:00:00\")\n", "].index" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "df_temp_melted = df_temp_melted[df_temp_melted[\"station\"].isin(useful_places_temp)]" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "df_temp_melted[\"property\"] = \"temperature\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Concat to have a single dataframe" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "df_final = pd.concat([df_lluvia_melted, df_pm_melted, df_temp_melted])" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "df_final[\"station\"] = df_final[\"station\"].apply(lambda x: x.upper())" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "df_final = df_final[df_final[\"ds\"] <= pd.to_datetime(\"2025-08-31 23:00:00\")]" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "df_final[\"station\"] = df_final[\"station\"].apply(\n", " lambda x: \"SAN ANTONIO\" if x == \"SANANTONIO\" else x\n", ")\n", "\n", "df_final[\"station\"] = df_final[\"station\"].apply(\n", " lambda x: \"EL CAMAL\" if x == \"ELCAMAL\" else x\n", ")" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['BELISARIO', 'CARAPUNGO', 'CENTRO', 'COTOCOLLAO', 'EL CAMAL',\n", " 'LOS CHILLOS', 'TUMBACO', 'SAN ANTONIO'], dtype=object)" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_final[\"station\"].unique()" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "FINAL_PATH = \"/Users/sebastianalejandrosarastizambonino/Documents/conferences/aws_community_day_2025/data\"\n", "df_final.to_parquet(f\"{FINAL_PATH}/datos_ambiente_quito.parquet\")" ] } ], "metadata": { "kernelspec": { "display_name": "aws_conf", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.13" } }, "nbformat": 4, "nbformat_minor": 2 }