Developing an ML Model to Predict the Energy Efficiency of Buildings

Predicting the Energy Star Score of Buildings Using a Random Forest Regressor

Data Cleaning

def missing_values_table(df):
    # Count and percentage of missing values per column
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * mis_val / len(df)
    # Keep only the columns that have missing values
    mis_val_table_ren_columns = pd.concat([mis_val, mis_val_percent], axis=1)
    mis_val_table_ren_columns.columns = ['Missing Values', '% of Total Values']
    mis_val_table_ren_columns = mis_val_table_ren_columns[mis_val_table_ren_columns['Missing Values'] > 0]
    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns that have missing values.")
    return mis_val_table_ren_columns

Exploring our Data

# Find all correlations and sort
correlations_data = data.corr()['score'].sort_values()

# Print the most negative correlations
print(correlations_data.head(15), '\n')

# Print the most positive correlations
print(correlations_data.tail(15))

Feature Engineering and Selection

# Select the numeric columns
numeric_subset = data.select_dtypes('number')

# Select the categorical columns
categorical_subset = data[['Borough', 'Largest Property Use Type']]

# One-hot encode the categorical columns
categorical_subset = pd.get_dummies(categorical_subset)

# Create columns with the log of each numeric column
for col in numeric_subset.columns:
    # Skip the Energy Star Score column
    if col == 'score':
        continue
    else:
        numeric_subset['log_' + col] = np.log(numeric_subset[col])

# Join the two dataframes using concat
features = pd.concat([numeric_subset, categorical_subset], axis=1)
print(features.shape)

# Remove the collinear features above a specified correlation coefficient
# (a sketch of this helper appears after this block)
features = remove_collinear_features(features, 0.6)

# Remove any columns with all na values
features = features.dropna(axis=1, how='all')
features.shape
(11319, 65)
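
The remove_collinear_features helper called above isn't defined in the excerpt. A common sketch of such a helper, assuming it drops one feature from every pair whose absolute correlation exceeds the threshold and leaves the score column untouched:

def remove_collinear_features(x, threshold):
    # Set the target aside so it is never dropped
    y = x['score']
    x = x.drop(columns=['score'])

    # Upper triangle of the absolute correlation matrix
    corr_matrix = x.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    # Drop one feature from every pair correlated above the threshold
    drops = [col for col in upper.columns if any(upper[col] > threshold)]
    x = x.drop(columns=drops)

    # Reattach the target
    x['score'] = y
    return x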

Naive Baseline

X, X_test, y, y_test = train_test_split(features, targets, test_size=0.3, random_state=42)

def mae(y_true, y_pred):
    return np.mean(abs(y_true - y_pred))

baseline_guess = np.median(y)
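
Scoring the naive guess gives a number every model has to beat; a quick check with the mae helper just defined:

print('The baseline guess is a score of %0.2f' % baseline_guess)
print('Baseline Performance on the test set: MAE = %0.4f' % mae(y_test, baseline_guess))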

Evaluating and Comparing Models
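
Every model below is trained and scored through a single fit_and_evaluate helper that isn't shown in the excerpt. A minimal sketch of what it plausibly does, assuming it fits on the training split and returns the test-set MAE:

def fit_and_evaluate(model):
    # Train the model on the training data
    model.fit(X, y)
    # Predict on the test set and return the mean absolute error
    model_pred = model.predict(X_test)
    return mae(y_test, model_pred)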

lr = LinearRegression()
lr_mae = fit_and_evaluate(lr)

print('Linear Regression Performance on the test set: MAE = %0.4f' % lr_mae)

svm = SVR(C = 1.0, gamma = 0.1)
svm_mae = fit_and_evaluate(svm)

print('Support Vector Machine Regression Performance on the test set: MAE = %0.4f' % svm_mae)

random_forest = RandomForestRegressor(random_state=50)
random_forest_mae = fit_and_evaluate(random_forest)

print('Random Forest Regression Performance on the test set: MAE = %0.4f' % random_forest_mae)

gradient_boosted = GradientBoostingRegressor(random_state=50)
gradient_boosted_mae = fit_and_evaluate(gradient_boosted)

print('Gradient Boosted Regression Performance on the test set: MAE = %0.4f' % gradient_boosted_mae)

knn = KNeighborsRegressor(n_neighbors=10)
knn_mae = fit_and_evaluate(knn)

print('K-Nearest Neighbors Regression Performance on the test set: MAE = %0.4f' % knn_mae)

Hyperparameters

# Hyperparameter ranges to sample from
# Number of trees in the forest
n_estimators = [100, 200, 400, 500, 800]
# Maximum depth of each tree
max_depth = [2, 4, 5, 8, 12]
# Minimum number of samples per leaf
min_samples_leaf = [1, 2, 4, 6, 8]
# Minimum number of samples to split a node
min_samples_split = [2, 4, 6, 8, 10]
# Maximum number of features to consider for making splits
max_features = ['auto', 'sqrt', 'log2', None]

# Define the grid of hyperparameters to search
hyperparameter_grid = {'n_estimators': n_estimators,
                       'max_depth': max_depth,
                       'min_samples_leaf': min_samples_leaf,
                       'min_samples_split': min_samples_split,
                       'max_features': max_features}

# Create the model to use for hyperparameter tuning
model = RandomForestRegressor(random_state=42)

# Set up the random search with 4-fold cross validation
random_cv = RandomizedSearchCV(estimator=model,
                               param_distributions=hyperparameter_grid,
                               cv=4, n_iter=25,
                               scoring='neg_mean_absolute_error',
                               n_jobs=-1, verbose=1,
                               return_train_score=True,
                               random_state=42)

# Fit on the training data
random_cv.fit(X, y)
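
Once the search finishes, scikit-learn exposes the winning configuration through standard attributes:

# Best hyperparameter combination found by the search
print(random_cv.best_params_)

# Estimator refit on all the training data with those settings
print(random_cv.best_estimator_)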

Testing Out The Model
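
Neither default_model nor final_model is defined in the excerpt. A plausible reading, assuming the default is an untuned random forest and the final model is the best estimator found by the random search:

# Untuned forest for comparison (assumed definition)
default_model = RandomForestRegressor(random_state=42)
default_model.fit(X, y)

# Tuned forest from the search; best_estimator_ is already refit on the
# full training set when refit=True, RandomizedSearchCV's default
final_model = random_cv.best_estimator_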

# Make predictions on the test set using default and final model
default_pred = default_model.predict(X_test)
final_pred = final_model.predict(X_test)
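
Scoring both sets of predictions with the mae helper from the baseline section puts a number on the gain from tuning:

print('Default model performance on the test set: MAE = %0.4f' % mae(y_test, default_pred))
print('Final model performance on the test set: MAE = %0.4f' % mae(y_test, final_pred))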

Local Interpretable Model-agnostic Explanations (LIME)
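
The explainer, the wrong instance, and residuals used below aren't constructed in the excerpt. A minimal setup, assuming the lime package's tabular explainer in regression mode and that wrong is the test observation the tuned model gets most wrong:

import lime.lime_tabular

# Find the tuned model's single worst test-set prediction (assumed setup)
model = final_model
residuals = np.abs(model.predict(X_test) - np.array(y_test).reshape((-1,)))
wrong = X_test.values[np.argmax(residuals)]

# Tabular explainer built from the training data
explainer = lime.lime_tabular.LimeTabularExplainer(
    training_data=X.values,
    mode='regression',
    training_labels=np.array(y).reshape((-1,)),
    feature_names=list(X.columns))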

# Display the predicted and true value for the wrong instance
print('Prediction: %0.4f' % model.predict(wrong.reshape(1, -1)))
print('Actual Value: %0.4f' % y_test[np.argmax(residuals)])

# Explanation for wrong prediction
wrong_exp = explainer.explain_instance(data_row=wrong,
                                       predict_fn=model.predict)
Prediction: 21.6300
Actual Value: 94.0000
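
The explanation object can also be rendered as a bar chart of the features pushing the prediction up or down, via lime's built-in matplotlib helper:

import matplotlib.pyplot as plt

# Plot the feature contributions behind this one prediction
wrong_exp.as_pyplot_figure()
plt.title('Explanation of Prediction')
plt.xlabel('Effect on Prediction')
plt.show()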

Conclusion

Thank you so much for reading this! I’m a 15-year-old passionate about sustainability and the author of “Chronicles of Illusions: The Blue Wild”. If you want to see more of my work, connect with me on LinkedIn or Twitter, or subscribe to my monthly newsletter!
