In [21]:
from pathlib import Path

import numpy as np
import pandas as pd

In [22]:
# Load the historical weather export. A forward slash keeps the path portable
# across operating systems and avoids a backslash escape inside the literal.
data_hist = pd.read_csv('data/in_data_hist.csv')
data_hist.columns

Index(['name', 'datetime', 'temp', 'feelslike', 'dew', 'humidity', 'precip',
       'precipprob', 'preciptype', 'snow', 'snowdepth', 'windgust',
       'windspeed', 'winddir', 'sealevelpressure', 'cloudcover', 'visibility',
       'solarradiation', 'solarenergy', 'uvindex', 'severerisk', 'conditions',
       'icon', 'stations'],
      dtype='object')

In [23]:
# Weather fields kept from the raw export, listed under their original names.
columns = ['datetime', 'temp', 'feelslike', 'humidity', 'sealevelpressure', 'conditions']

# Narrow to those fields, switch to the outdoor_* naming scheme, parse the
# timestamps and make them the index, all in one chain.
data_hist = (
    data_hist.loc[:, columns]
    .rename(columns={
        'datetime': 'timestamp',
        'temp': 'outdoor_temp',
        'feelslike': 'outdoor_feels_like',
        'humidity': 'outdoor_humidity',
        'sealevelpressure': 'outdoor_pressure',
        'conditions': 'outdoor_weather',
    })
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
    .set_index('timestamp')
)
data_hist

Unnamed: 0_level_0,outdoor_temp,outdoor_feels_like,outdoor_humidity,outdoor_pressure,outdoor_weather
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-04-24 00:00:00,30.7,38.3,74.75,1006.0,Partially cloudy
2024-04-24 01:00:00,30.3,40.0,83.88,1005.3,Partially cloudy
2024-04-24 02:00:00,30.0,37.4,79.19,1005.0,Partially cloudy
2024-04-24 03:00:00,30.2,37.5,76.90,1005.0,Partially cloudy
2024-04-24 04:00:00,30.3,39.2,81.57,1004.8,Partially cloudy
...,...,...,...,...,...
2024-05-06 19:00:00,32.9,38.8,58.34,1007.0,Partially cloudy
2024-05-06 20:00:00,32.6,39.9,64.24,1007.0,"Rain, Partially cloudy"
2024-05-06 21:00:00,32.6,39.6,63.24,1007.0,Partially cloudy
2024-05-06 22:00:00,32.0,41.1,72.21,1008.0,Partially cloudy


In [24]:
# The PM10 / PM2.5 statistics are limited to the same window as the weather
# data: 2024-04-24 through 2024-05-06 (both bounds inclusive).
PM_WINDOW_START = '2024-04-24'
PM_WINDOW_END = '2024-05-06'


def _load_pm_window(path, start=PM_WINDOW_START, end=PM_WINDOW_END):
    """Read one particulate-matter CSV and keep the rows inside [start, end].

    Adds a parsed ``ts`` column (built from ``date``) and a ``mean`` column,
    which is the plain average of the ``q1``, ``q3`` and ``median`` columns.
    """
    frame = pd.read_csv(path)
    frame['ts'] = pd.to_datetime(frame['date'])
    in_window = (frame['ts'] >= start) & (frame['ts'] <= end)
    # Copy so the column added below lands on an independent frame.
    frame = frame[in_window].copy()
    frame['mean'] = (frame['q1'] + frame['q3'] + frame['median']) / 3
    return frame


pm10 = _load_pm_window('data/pm10.csv')
pm25 = _load_pm_window('data/pm25.csv')


In [25]:
def generate_hourly_values(min_val, max_val, count):
    """Return ``count`` evenly spaced values running from ``min_val`` to ``max_val``.

    Both endpoints are part of the result (``np.linspace`` semantics).
    """
    return np.linspace(min_val, max_val, num=count)

def generate_hourly_data(data):
    """Expand each daily row of ``data`` into 24 hourly rows.

    For every row, 24 values are produced by ``generate_hourly_values`` from
    the row's ``min`` to its ``max`` and paired with hourly timestamps that
    start at the row's ``ts``.

    Parameters
    ----------
    data : pd.DataFrame
        Must provide ``ts``, ``min`` and ``max`` columns.

    Returns
    -------
    pd.DataFrame
        Columns ``ts`` and ``value`` with a fresh 0..n-1 index; an empty frame
        with those two columns when ``data`` has no rows.
    """
    daily_frames = [
        pd.DataFrame({
            # A Timedelta step is used instead of the 'H' alias, which is
            # deprecated in pandas 2.2.
            'ts': pd.date_range(start=row['ts'], periods=24, freq=pd.Timedelta(hours=1)),
            'value': generate_hourly_values(row['min'], row['max'], 24),
        })
        for _, row in data.iterrows()
    ]
    if not daily_frames:
        return pd.DataFrame(columns=['ts', 'value'])
    # Concatenate once: appending inside the loop re-copies every earlier row
    # and concatenates onto an empty frame, which newer pandas warns about.
    return pd.concat(daily_frames, ignore_index=True)

# Expand both daily pollutant frames into hourly rows.
pm10, pm25 = (generate_hourly_data(daily) for daily in (pm10, pm25))

In [26]:
# Give each pollutant's generic 'value' column its final outdoor_* name.
pm10, pm25 = (
    frame.rename(columns={'value': label})
    for frame, label in ((pm10, 'outdoor_pm10'), (pm25, 'outdoor_pm25'))
)

### Combine

In [27]:
# Convert every join key to timezone-aware UTC so both sides of the lookups
# below carry the same dtype.
pm10['ts'] = pd.to_datetime(pm10['ts'], utc=True)
pm25['ts'] = pd.to_datetime(pm25['ts'], utc=True)

# Build the combined frame as one chain of new objects instead of resetting
# the index in place, so the previously displayed frame is not mutated.
data_hist = (
    data_hist.reset_index()
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp'], utc=True))
    # DataFrame.join defaults to a left join: every weather row is kept and
    # the pollutant value is looked up by timestamp.
    .join(pm10.set_index('ts'), on='timestamp')
    .join(pm25.set_index('ts'), on='timestamp')
    # Placeholder columns, filled entirely with NaN.
    .assign(**dict.fromkeys(['outdoor_description', 'indoor_temp', 'indoor_light'], np.nan))
    .loc[:, ['timestamp', 'outdoor_temp', 'outdoor_feels_like', 'outdoor_pressure',
             'outdoor_humidity', 'outdoor_weather', 'outdoor_description', 'outdoor_pm25',
             'outdoor_pm10', 'indoor_temp', 'indoor_light']]
)

In [28]:
from datetime import datetime

def convert_datetime(datetime_str):
    """Re-render an ISO-format date/time string as ``YYYY-MM-DD HH:MM:SS``.

    The output pattern has no field for a UTC offset or fractional seconds,
    so any such part of the input is simply left out (no conversion is done).
    """
    return datetime.fromisoformat(datetime_str).strftime('%Y-%m-%d %H:%M:%S')

# Replace each timestamp with its 'YYYY-MM-DD HH:MM:SS' text form.
data_hist['timestamp'] = data_hist['timestamp'].map(
    lambda stamp: convert_datetime(str(stamp))
)

In [29]:
# Create the output directory first so the write does not fail on a fresh
# checkout where 'out/' is missing.
output_path = Path('out/data_hist.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
data_hist.to_csv(output_path, index=False)