Libraries

import warnings
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

import numpy as np
import pandas as pd
# Show every column and the full contents of each cell when a DataFrame is
# displayed. The exact option name "display.max_columns" is used here: the
# previous key "display.max.columns" is not a real option and only resolved
# because pandas matches option keys as regular expressions.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use("ggplot")

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import (LinearRegression, 
                                  Ridge, 
                                  Lasso)
from sklearn.metrics import (r2_score, 
                             mean_absolute_error, 
                             mean_squared_error)

The Dataset

The dataset is the Appliances Energy Prediction data. It was recorded at 10-minute intervals over about 4.5 months. The house temperature and humidity conditions were monitored with a ZigBee wireless sensor network. Each wireless node transmitted the temperature and humidity conditions about every 3.3 minutes. Then, the wireless data was averaged over 10-minute periods. The energy data was logged every 10 minutes with m-bus energy meters. Weather data from the nearest airport weather station (Chievres Airport, Belgium) was downloaded from a public data set from Reliable Prognosis (rp5.ru) and merged with the experimental data sets using the date and time column. Two random variables have been included in the data set for testing the regression models and for filtering out non-predictive attributes (parameters).

energy = pd.read_csv("datasets/energydata_complete.csv")
energy.head()
date Appliances lights T1 RH_1 T2 RH_2 T3 RH_3 T4 RH_4 T5 RH_5 T6 RH_6 T7 RH_7 T8 RH_8 T9 RH_9 T_out Press_mm_hg RH_out Windspeed Visibility Tdewpoint rv1 rv2
0 2016-01-11 17:00:00 60 30 19.89 47.596667 19.2 44.790000 19.79 44.730000 19.000000 45.566667 17.166667 55.20 7.026667 84.256667 17.200000 41.626667 18.2 48.900000 17.033333 45.53 6.600000 733.5 92.0 7.000000 63.000000 5.3 13.275433 13.275433
1 2016-01-11 17:10:00 60 30 19.89 46.693333 19.2 44.722500 19.79 44.790000 19.000000 45.992500 17.166667 55.20 6.833333 84.063333 17.200000 41.560000 18.2 48.863333 17.066667 45.56 6.483333 733.6 92.0 6.666667 59.166667 5.2 18.606195 18.606195
2 2016-01-11 17:20:00 50 30 19.89 46.300000 19.2 44.626667 19.79 44.933333 18.926667 45.890000 17.166667 55.09 6.560000 83.156667 17.200000 41.433333 18.2 48.730000 17.000000 45.50 6.366667 733.7 92.0 6.333333 55.333333 5.1 28.642668 28.642668
3 2016-01-11 17:30:00 50 40 19.89 46.066667 19.2 44.590000 19.79 45.000000 18.890000 45.723333 17.166667 55.09 6.433333 83.423333 17.133333 41.290000 18.1 48.590000 17.000000 45.40 6.250000 733.8 92.0 6.000000 51.500000 5.0 45.410389 45.410389
4 2016-01-11 17:40:00 60 40 19.89 46.333333 19.2 44.530000 19.79 45.000000 18.890000 45.530000 17.200000 55.09 6.366667 84.893333 17.200000 41.230000 18.1 48.590000 17.000000 45.40 6.133333 733.9 92.0 5.666667 47.666667 4.9 10.084097 10.084097

Dataset Description

The attribute information can be seen below.

Attribute Information:

Attribute Description Units
date time, year-month-day hour:minute:second
Appliances energy use in Wh
lights energy use of light fixtures in the house in Wh
T1 Temperature in kitchen area in Celsius
RH_1 Humidity in kitchen area in %
T2 Temperature in living room area in Celsius
RH_2 Humidity in living room area in %
T3 Temperature in laundry room area in Celsius
RH_3 Humidity in laundry room area in %
T4 Temperature in office room in Celsius
RH_4 Humidity in office room in %
T5 Temperature in bathroom in Celsius
RH_5 Humidity in bathroom in %
T6 Temperature outside the building (north side) in Celsius
RH_6 Humidity outside the building (north side) in %
T7 Temperature in ironing room in Celsius
RH_7 Humidity in ironing room in %
T8 Temperature in teenager room 2 in Celsius
RH_8 Humidity in teenager room 2 in %
T9 Temperature in parents room in Celsius
RH_9 Humidity in parents room in %
T_out Temperature outside (from Chievres weather station) in Celsius
Press_mm_hg Pressure (from Chievres weather station) in mm Hg
RH_out Humidity outside (from Chievres weather station) in %
Windspeed Wind speed (from Chievres weather station) in m/s
Visibility (from Chievres weather station) in km
Tdewpoint Dew point temperature (from Chievres weather station) in °C
rv1 Random variable 1 nondimensional
rv2 Random variable 2 nondimensional
energy.describe()
Appliances lights T1 RH_1 T2 RH_2 T3 RH_3 T4 RH_4 T5 RH_5 T6 RH_6 T7 RH_7 T8 RH_8 T9 RH_9 T_out Press_mm_hg RH_out Windspeed Visibility Tdewpoint rv1 rv2
count 19735.000000 19735.000000 19735.000000 19735.000000 19735.000000 19735.000000 19735.000000 19735.000000 19735.000000 19735.000000 19735.000000 19735.000000 19735.000000 19735.000000 19735.000000 19735.000000 19735.000000 19735.000000 19735.000000 19735.000000 19735.000000 19735.000000 19735.000000 19735.000000 19735.000000 19735.000000 19735.000000 19735.000000
mean 97.694958 3.801875 21.686571 40.259739 20.341219 40.420420 22.267611 39.242500 20.855335 39.026904 19.592106 50.949283 7.910939 54.609083 20.267106 35.388200 22.029107 42.936165 19.485828 41.552401 7.411665 755.522602 79.750418 4.039752 38.330834 3.760707 24.988033 24.988033
std 102.524891 7.935988 1.606066 3.979299 2.192974 4.069813 2.006111 3.254576 2.042884 4.341321 1.844623 9.022034 6.090347 31.149806 2.109993 5.114208 1.956162 5.224361 2.014712 4.151497 5.317409 7.399441 14.901088 2.451221 11.794719 4.194648 14.496634 14.496634
min 10.000000 0.000000 16.790000 27.023333 16.100000 20.463333 17.200000 28.766667 15.100000 27.660000 15.330000 29.815000 -6.065000 1.000000 15.390000 23.200000 16.306667 29.600000 14.890000 29.166667 -5.000000 729.300000 24.000000 0.000000 1.000000 -6.600000 0.005322 0.005322
25% 50.000000 0.000000 20.760000 37.333333 18.790000 37.900000 20.790000 36.900000 19.530000 35.530000 18.277500 45.400000 3.626667 30.025000 18.700000 31.500000 20.790000 39.066667 18.000000 38.500000 3.666667 750.933333 70.333333 2.000000 29.000000 0.900000 12.497889 12.497889
50% 60.000000 0.000000 21.600000 39.656667 20.000000 40.500000 22.100000 38.530000 20.666667 38.400000 19.390000 49.090000 7.300000 55.290000 20.033333 34.863333 22.100000 42.375000 19.390000 40.900000 6.916667 756.100000 83.666667 3.666667 40.000000 3.433333 24.897653 24.897653
75% 100.000000 0.000000 22.600000 43.066667 21.500000 43.260000 23.290000 41.760000 22.100000 42.156667 20.619643 53.663333 11.256000 83.226667 21.600000 39.000000 23.390000 46.536000 20.600000 44.338095 10.408333 760.933333 91.666667 5.500000 40.000000 6.566667 37.583769 37.583769
max 1080.000000 70.000000 26.260000 63.360000 29.856667 56.026667 29.236000 50.163333 26.200000 51.090000 25.795000 96.321667 28.290000 99.900000 26.000000 51.400000 27.230000 58.780000 24.500000 53.326667 26.100000 772.300000 100.000000 14.000000 66.000000 15.500000 49.996530 49.996530
energy.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19735 entries, 0 to 19734
Data columns (total 29 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   date         19735 non-null  object 
 1   Appliances   19735 non-null  int64  
 2   lights       19735 non-null  int64  
 3   T1           19735 non-null  float64
 4   RH_1         19735 non-null  float64
 5   T2           19735 non-null  float64
 6   RH_2         19735 non-null  float64
 7   T3           19735 non-null  float64
 8   RH_3         19735 non-null  float64
 9   T4           19735 non-null  float64
 10  RH_4         19735 non-null  float64
 11  T5           19735 non-null  float64
 12  RH_5         19735 non-null  float64
 13  T6           19735 non-null  float64
 14  RH_6         19735 non-null  float64
 15  T7           19735 non-null  float64
 16  RH_7         19735 non-null  float64
 17  T8           19735 non-null  float64
 18  RH_8         19735 non-null  float64
 19  T9           19735 non-null  float64
 20  RH_9         19735 non-null  float64
 21  T_out        19735 non-null  float64
 22  Press_mm_hg  19735 non-null  float64
 23  RH_out       19735 non-null  float64
 24  Windspeed    19735 non-null  float64
 25  Visibility   19735 non-null  float64
 26  Tdewpoint    19735 non-null  float64
 27  rv1          19735 non-null  float64
 28  rv2          19735 non-null  float64
dtypes: float64(26), int64(2), object(1)
memory usage: 4.4+ MB

There are no missing values in the dataset.

# Min-max scale every column except 'date' and 'lights', then separate the
# 'Appliances' target from the remaining feature columns.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split, so the test rows also contribute to the scaling.
numeric_cols = energy.drop(columns=['date', 'lights'])
scaler = MinMaxScaler()
normalised_df = pd.DataFrame(
    scaler.fit_transform(numeric_cols),
    columns=numeric_cols.columns,
)
features_df = normalised_df.drop(columns=['Appliances'])
energy_target = normalised_df['Appliances']

# Hold out 30% of the rows for testing; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    features_df,
    energy_target,
    test_size=.3,
    random_state=42,
)

From the dataset, fit a linear model on the relationship between the temperature in the living room in Celsius (x = T2) and the temperature outside the building (y = T6).

# Simple linear regression with the (scaled) T2 column as the single
# predictor and the (scaled) T6 column as the response.
lin_reg = LinearRegression().fit(X_train.loc[:, ['T2']], X_train['T6'])
T6_pred = lin_reg.predict(X_test.loc[:, ['T2']])

# Coefficient of determination on the held-out rows, to two decimals.
t6_r2 = r2_score(X_test['T6'], T6_pred)
print(f'r^2 score: {round(t6_r2, 2)}')
r^2 score: 0.64
print(f'MAE: {round(mean_absolute_error(X_test.T6, T6_pred), 2)}')
MAE: 0.08
print(f'Residual Sum of Squares: {round(np.sum(np.square(X_test.T6 - T6_pred)), 2)}')
Residual Sum of Squares: 274.9
print(f'Root Mean Squared Error: {round(np.sqrt(mean_squared_error(X_test.T6, T6_pred)), 3)}')
Root Mean Squared Error: 0.215
energy.drop(columns=['date', 'lights']).max().sort_values()
Windspeed        14.000000
Tdewpoint        15.500000
T9               24.500000
T5               25.795000
T7               26.000000
T_out            26.100000
T4               26.200000
T1               26.260000
T8               27.230000
T6               28.290000
T3               29.236000
T2               29.856667
rv1              49.996530
rv2              49.996530
RH_3             50.163333
RH_4             51.090000
RH_7             51.400000
RH_9             53.326667
RH_2             56.026667
RH_8             58.780000
RH_1             63.360000
Visibility       66.000000
RH_5             96.321667
RH_6             99.900000
RH_out          100.000000
Press_mm_hg     772.300000
Appliances     1080.000000
dtype: float64
energy.drop(columns=['date', 'lights']).min().sort_values()
Tdewpoint       -6.600000
T6              -6.065000
T_out           -5.000000
Windspeed        0.000000
rv2              0.005322
rv1              0.005322
Visibility       1.000000
RH_6             1.000000
Appliances      10.000000
T9              14.890000
T4              15.100000
T5              15.330000
T7              15.390000
T2              16.100000
T8              16.306667
T1              16.790000
T3              17.200000
RH_2            20.463333
RH_7            23.200000
RH_out          24.000000
RH_1            27.023333
RH_4            27.660000
RH_3            28.766667
RH_9            29.166667
RH_8            29.600000
RH_5            29.815000
Press_mm_hg    729.300000
dtype: float64
def get_weights_df(model, feat, col_name):
    """Return the coefficients of a fitted model as a two-column DataFrame.

    Parameters
    ----------
    model : object
        Fitted estimator exposing a ``coef_`` attribute with one weight per
        column of ``feat``.
    feat : pandas.DataFrame
        Frame whose ``columns`` label the coefficients.
    col_name : str
        Name given to the weight column of the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``['Features', col_name]``, ordered by ascending weight, with
        the weights rounded to three decimal places.
    """
    weights = pd.Series(model.coef_, feat.columns).sort_values()
    weights_df = pd.DataFrame(weights).reset_index()
    weights_df.columns = ['Features', col_name]
    # Series.round returns a new Series, so the result must be assigned back;
    # the bare call previously left the weights unrounded.
    weights_df[col_name] = weights_df[col_name].round(3)
    return weights_df
ridge_reg = Ridge(alpha=0.4)
ridge_reg.fit(X_train, y_train)
Ridge(alpha=0.4)
lasso_reg = Lasso(alpha=0.001)
lasso_reg.fit(X_train, y_train)
Lasso(alpha=0.001)
model = LinearRegression()
model.fit(X_train, y_train)
LinearRegression()
# One weights table per fitted model, all labelled by the training columns.
linear_model_weights = get_weights_df(model, X_train, 'Linear_Model_Weight')
ridge_weights_df = get_weights_df(ridge_reg, X_train, 'Ridge_Weight')
lasso_weights_df = get_weights_df(lasso_reg, X_train, 'Lasso_weight')

# Join the three tables on the feature name so each row compares the weight
# the three models assign to the same feature.
final_weights = (
    linear_model_weights
    .merge(ridge_weights_df, on='Features')
    .merge(lasso_weights_df, on='Features')
)
final_weights.sort_values("Linear_Model_Weight", ascending=False)
Features Linear_Model_Weight Ridge_Weight Lasso_weight
25 RH_1 0.553547 0.519525 0.017880
24 T3 0.290627 0.288087 0.000000
23 T6 0.236425 0.217292 0.000000
22 Tdewpoint 0.117758 0.083128 0.000000
21 T8 0.101995 0.101028 0.000000
20 RH_3 0.096048 0.095135 0.000000
19 RH_6 0.038049 0.035519 -0.000000
18 Windspeed 0.029183 0.030268 0.002912
17 T4 0.028981 0.027384 -0.000000
16 RH_4 0.026386 0.024579 0.000000
15 RH_5 0.016006 0.016152 0.000000
14 Visibility 0.012307 0.012076 0.000000
13 T7 0.010319 0.010098 -0.000000
12 Press_mm_hg 0.006839 0.006584 -0.000000
11 rv2 0.000770 0.000748 -0.000000
10 rv1 0.000770 0.000748 -0.000000
9 T1 -0.003281 -0.018406 0.000000
8 T5 -0.015657 -0.019853 -0.000000
7 RH_9 -0.039800 -0.041367 -0.000000
6 RH_7 -0.044614 -0.045977 -0.000000
5 RH_out -0.077671 -0.054724 -0.049557
4 RH_8 -0.157595 -0.156830 -0.000110
3 T9 -0.189941 -0.188916 -0.000000
2 T2 -0.236178 -0.201397 0.000000
1 T_out -0.321860 -0.262172 0.000000
0 RH_2 -0.456698 -0.411071 -0.000000
# Predictions on the held-out rows from the plain linear, ridge and lasso
# models respectively.
y_pred_lg = model.predict(X_test)
y_pred_r = ridge_reg.predict(X_test)
y_pred_l = lasso_reg.predict(X_test)

# RMSE of the ridge model's predictions, to three decimals.
ridge_rmse = np.sqrt(mean_squared_error(y_test, y_pred_r))
print(f'Root Mean Squared Error: {round(ridge_rmse, 3)}')
Root Mean Squared Error: 0.088
# RMSE of the lasso model's predictions, to three decimals.
lasso_rmse = np.sqrt(mean_squared_error(y_test, y_pred_l))
print(f'Root Mean Squared Error: {round(lasso_rmse, 3)}')
Root Mean Squared Error: 0.094