In [1]:
import pandas as pd
from xgboost import XGBRegressor

In [2]:
import plotly.graph_objects as go
from sklearn.metrics import mean_squared_error, mean_absolute_error

def plot_predict_vs_actual(test_set, prediction):
    """Display an interactive actual-vs-predicted scatter plot.

    Two traces are drawn against the actual values on the x-axis:
    a dashed red reference line where y equals x (a perfect prediction),
    and semi-transparent blue markers for the model's predictions.

    Args:
        test_set: Actual target values.
        prediction: Predicted values, aligned element-wise with ``test_set``.

    Returns:
        None. The figure is rendered via ``fig.show()``.
    """
    # Reference diagonal: plotting the actual values against themselves.
    reference_line = go.Scatter(
        x=test_set,
        y=test_set,
        mode='lines',
        name='Ideal Line',
        line={'color': 'red', 'dash': 'dash'},
    )
    predicted_points = go.Scatter(
        x=test_set,
        y=prediction,
        mode='markers',
        name='Predictions',
        marker={'color': 'blue', 'opacity': 0.5},
    )

    figure = go.Figure(data=[reference_line, predicted_points])
    figure.update_layout(
        title='Actual vs. Predicted Values',
        xaxis_title='Actual Values',
        yaxis_title='Predicted Values',
        xaxis_showgrid=True,
        yaxis_showgrid=True,
        showlegend=True,
        # Pin the legend to the top-left corner of the plotting area.
        legend={'x': 0, 'y': 1},
    )
    figure.show()


def evaluate_regression(test_set, prediction):
    """Print standard regression error metrics and the observed value ranges.

    Reports MSE, RMSE and MAE between the actual and predicted values,
    followed by the min/max of each so the spread of predictions can be
    compared with the spread of the ground truth.

    Args:
        test_set: Actual target values (iterable of numbers).
        prediction: Predicted values, aligned element-wise with ``test_set``.

    Returns:
        None. All results are written to stdout.
    """
    import math

    mse = mean_squared_error(test_set, prediction)
    # Derive RMSE from the MSE rather than calling
    # mean_squared_error(..., squared=False): the ``squared`` keyword was
    # deprecated in scikit-learn 1.4 and removed in 1.6, so that call raises
    # TypeError on current releases. For a single-output target this is the
    # same quantity, and it avoids a second pass over the data.
    rmse = math.sqrt(mse)
    mae = mean_absolute_error(test_set, prediction)

    min_actual = min(test_set)
    max_actual = max(test_set)
    min_pred = min(prediction)
    max_pred = max(prediction)

    print('Mean Squared Error (MSE):', mse)
    print('Root Mean Squared Error (RMSE):', rmse)
    print('Mean Absolute Error (MAE):', mae)
    print('Range of Actual Values:', min_actual, '-', max_actual)
    print('Range of Predicted Values:', min_pred, '-', max_pred)

In [3]:
# Read the CSV at out/data_cleaned.csv into a DataFrame used by the cells below.
# NOTE(review): presumably produced by an earlier cleaning step — confirm the
# file exists before running this notebook on a fresh checkout.
data = pd.read_csv('out/data_cleaned.csv')

In [4]:
from sklearn.model_selection import train_test_split

# Target: the indoor temperature column.
y = data['indoor_temp']

# Features: every remaining column except the target and the ones excluded below.
X = data.drop(
    columns=[
        'indoor_temp',
        'indoor_light',
        'outdoor_weather',
        'timestamp',
        'outdoor_pm25',
        'outdoor_pm10',
    ]
)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): this is a random split and the data carries a `timestamp`
# column — confirm a shuffled split (rather than a chronological one) is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Gradient-boosted regressor with row/column subsampling and light L1/L2 regularisation.
xgboost = XGBRegressor(
    n_estimators=1000,
    max_depth=5,
    subsample=0.5,
    colsample_bytree=0.5,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=42,
)
xgboost.fit(X_train, y_train)

# Score the fitted model on the held-out rows.
y_pred = xgboost.predict(X_test)
evaluate_regression(y_test, y_pred)

Mean Squared Error (MSE): 0.15151159842304576
Root Mean Squared Error (RMSE): 0.3892449080245569
Mean Absolute Error (MAE): 0.25791334929289644
Range of Actual Values: 27.0 - 33.48197937011719
Range of Predicted Values: 26.980206 - 33.49997


In [5]:
# Plot held-out predictions against the actual values; markers lying on the
# dashed red diagonal are exact predictions.
plot_predict_vs_actual(y_test, y_pred)

In [6]:
import joblib

# Serialise the fitted regressor to out/xgboost_model.pkl so it can be reloaded
# later with joblib.load instead of retraining.
joblib.dump(xgboost, 'out/xgboost_model.pkl')

['out/xgboost_model.pkl']

In [7]:
# To reuse the previously saved model instead of retraining, uncomment the line below.

# xgboost = joblib.load('out/xgboost_model.pkl')