HomieCare/train/fill_missing.ipynb

360 lines
12 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['name', 'datetime', 'temp', 'feelslike', 'dew', 'humidity', 'precip',\n",
" 'precipprob', 'preciptype', 'snow', 'snowdepth', 'windgust',\n",
" 'windspeed', 'winddir', 'sealevelpressure', 'cloudcover', 'visibility',\n",
" 'solarradiation', 'solarenergy', 'uvindex', 'severerisk', 'conditions',\n",
" 'icon', 'stations'],\n",
" dtype='object')"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_hist = pd.read_csv('data\\in_data_hist.csv')\n",
"data_hist.columns"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>outdoor_temp</th>\n",
" <th>outdoor_feels_like</th>\n",
" <th>outdoor_humidity</th>\n",
" <th>outdoor_pressure</th>\n",
" <th>outdoor_weather</th>\n",
" </tr>\n",
" <tr>\n",
" <th>timestamp</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2024-04-24 00:00:00</th>\n",
" <td>30.7</td>\n",
" <td>38.3</td>\n",
" <td>74.75</td>\n",
" <td>1006.0</td>\n",
" <td>Partially cloudy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-24 01:00:00</th>\n",
" <td>30.3</td>\n",
" <td>40.0</td>\n",
" <td>83.88</td>\n",
" <td>1005.3</td>\n",
" <td>Partially cloudy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-24 02:00:00</th>\n",
" <td>30.0</td>\n",
" <td>37.4</td>\n",
" <td>79.19</td>\n",
" <td>1005.0</td>\n",
" <td>Partially cloudy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-24 03:00:00</th>\n",
" <td>30.2</td>\n",
" <td>37.5</td>\n",
" <td>76.90</td>\n",
" <td>1005.0</td>\n",
" <td>Partially cloudy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-04-24 04:00:00</th>\n",
" <td>30.3</td>\n",
" <td>39.2</td>\n",
" <td>81.57</td>\n",
" <td>1004.8</td>\n",
" <td>Partially cloudy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-05-06 19:00:00</th>\n",
" <td>32.9</td>\n",
" <td>38.8</td>\n",
" <td>58.34</td>\n",
" <td>1007.0</td>\n",
" <td>Partially cloudy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-05-06 20:00:00</th>\n",
" <td>32.6</td>\n",
" <td>39.9</td>\n",
" <td>64.24</td>\n",
" <td>1007.0</td>\n",
" <td>Rain, Partially cloudy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-05-06 21:00:00</th>\n",
" <td>32.6</td>\n",
" <td>39.6</td>\n",
" <td>63.24</td>\n",
" <td>1007.0</td>\n",
" <td>Partially cloudy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-05-06 22:00:00</th>\n",
" <td>32.0</td>\n",
" <td>41.1</td>\n",
" <td>72.21</td>\n",
" <td>1008.0</td>\n",
" <td>Partially cloudy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2024-05-06 23:00:00</th>\n",
" <td>32.0</td>\n",
" <td>40.5</td>\n",
" <td>70.65</td>\n",
" <td>1008.6</td>\n",
" <td>Partially cloudy</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>312 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" outdoor_temp outdoor_feels_like outdoor_humidity \\\n",
"timestamp \n",
"2024-04-24 00:00:00 30.7 38.3 74.75 \n",
"2024-04-24 01:00:00 30.3 40.0 83.88 \n",
"2024-04-24 02:00:00 30.0 37.4 79.19 \n",
"2024-04-24 03:00:00 30.2 37.5 76.90 \n",
"2024-04-24 04:00:00 30.3 39.2 81.57 \n",
"... ... ... ... \n",
"2024-05-06 19:00:00 32.9 38.8 58.34 \n",
"2024-05-06 20:00:00 32.6 39.9 64.24 \n",
"2024-05-06 21:00:00 32.6 39.6 63.24 \n",
"2024-05-06 22:00:00 32.0 41.1 72.21 \n",
"2024-05-06 23:00:00 32.0 40.5 70.65 \n",
"\n",
" outdoor_pressure outdoor_weather \n",
"timestamp \n",
"2024-04-24 00:00:00 1006.0 Partially cloudy \n",
"2024-04-24 01:00:00 1005.3 Partially cloudy \n",
"2024-04-24 02:00:00 1005.0 Partially cloudy \n",
"2024-04-24 03:00:00 1005.0 Partially cloudy \n",
"2024-04-24 04:00:00 1004.8 Partially cloudy \n",
"... ... ... \n",
"2024-05-06 19:00:00 1007.0 Partially cloudy \n",
"2024-05-06 20:00:00 1007.0 Rain, Partially cloudy \n",
"2024-05-06 21:00:00 1007.0 Partially cloudy \n",
"2024-05-06 22:00:00 1008.0 Partially cloudy \n",
"2024-05-06 23:00:00 1008.6 Partially cloudy \n",
"\n",
"[312 rows x 5 columns]"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"columns = ['datetime', 'temp', 'feelslike', 'humidity', 'sealevelpressure', 'conditions']\n",
"\n",
"data_hist = data_hist[columns]\n",
"data_hist = data_hist.rename(columns={'datetime': 'timestamp',\n",
" 'temp': 'outdoor_temp',\n",
" 'feelslike': 'outdoor_feels_like',\n",
" 'humidity': 'outdoor_humidity',\n",
" 'sealevelpressure': 'outdoor_pressure',\n",
" 'conditions': 'outdoor_weather'})\n",
"\n",
"data_hist['timestamp'] = pd.to_datetime(data_hist['timestamp'])\n",
"data_hist = data_hist.set_index('timestamp')\n",
"data_hist"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"pm10 = pd.read_csv('data/pm10.csv')\n",
"pm25 = pd.read_csv('data/pm25.csv')\n",
"\n",
"# use only data from 2023-04-24 to 2023-05-06\n",
"\n",
"pm10['ts'] = pd.to_datetime(pm10['date'])\n",
"pm25['ts'] = pd.to_datetime(pm25['date'])\n",
"\n",
"pm10 = pm10[(pm10['ts'] >= '2024-04-24') & (pm10['ts'] <= '2024-05-06')]\n",
"pm25 = pm25[(pm25['ts'] >= '2024-04-24') & (pm25['ts'] <= '2024-05-06')]\n",
"\n",
"pm10['mean'] = (pm10['q1'] + pm10['q3'] + pm10['median']) / 3\n",
"pm25['mean'] = (pm25['q1'] + pm25['q3'] + pm25['median']) / 3\n"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"def generate_hourly_values(min_val, max_val, count):\n",
" interpolated_values = np.linspace(min_val, max_val, count)\n",
" return interpolated_values\n",
"\n",
"def generate_hourly_data(data):\n",
" hourly_data = pd.DataFrame(columns=['ts', 'value'])\n",
"\n",
" for index, row in data.iterrows():\n",
" hourly_values = generate_hourly_values(row['min'], row['max'], 24)\n",
" hourly_day_data = pd.DataFrame({'ts': pd.date_range(start=row['ts'], periods=24, freq='H'), 'value': hourly_values})\n",
" hourly_data = pd.concat([hourly_data, hourly_day_data])\n",
" hourly_data.reset_index(drop=True, inplace=True)\n",
" return hourly_data\n",
"\n",
"pm10 = generate_hourly_data(pm10)\n",
"pm25 = generate_hourly_data(pm25)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"pm10 = pm10.rename(columns={'value': 'outdoor_pm10'})\n",
"pm25 = pm25.rename(columns={'value': 'outdoor_pm25'})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Combine"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"data_hist.reset_index(inplace=True)\n",
"\n",
"pm10['ts'] = pd.to_datetime(pm10['ts'], utc=True)\n",
"pm25['ts'] = pd.to_datetime(pm25['ts'], utc=True)\n",
"data_hist['timestamp'] = pd.to_datetime(data_hist['timestamp'], utc=True)\n",
"data_hist = data_hist.join(pm10.set_index('ts'), on='timestamp')\n",
"data_hist = data_hist.join(pm25.set_index('ts'), on='timestamp')\n",
"\n",
"data_hist['outdoor_description'] = np.nan\n",
"data_hist['indoor_temp'] = np.nan\n",
"data_hist['indoor_light'] = np.nan\n",
"\n",
"data_hist = data_hist[['timestamp', 'outdoor_temp', 'outdoor_feels_like', 'outdoor_pressure',\n",
" 'outdoor_humidity', 'outdoor_weather', 'outdoor_description', 'outdoor_pm25',\n",
" 'outdoor_pm10', 'indoor_temp', 'indoor_light']]"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"from datetime import datetime\n",
"\n",
"def convert_datetime(datetime_str):\n",
" datetime_obj = datetime.fromisoformat(datetime_str)\n",
" formatted_datetime_str = datetime_obj.strftime('%Y-%m-%d %H:%M:%S')\n",
" return formatted_datetime_str\n",
"\n",
"data_hist['timestamp'] = data_hist['timestamp'].apply(lambda x: convert_datetime(str(x)))"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"data_hist.to_csv('out/data_hist.csv', index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}