-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathforecast.py
150 lines (130 loc) · 6.56 KB
/
forecast.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import r2_score
def split_to_seasons(x):
"""
months to seasons mapping, assuming the climate has 4 distinct seasons
"""
if x in [12,1,2]:
return 'winter'
elif x in [3,4,5]:
return 'spring'
elif x in [6,7,8]:
return 'summer'
elif x in [9,10,11]:
return 'fall'
def time_of_day(x):
"""
hours into morning, highnoon, evening or night
"""
if x in [7, 8, 9, 10]:
return 'morning'
elif x in [11, 12, 13, 14, 15, 16]:
return 'highnoon'
elif x in [17, 18, 19, 20]:
return 'evening'
elif x in [21, 22, 23, 0, 1, 2, 3, 4, 5, 6]:
return 'night'
def forecast(path, forecast_date):
pd.set_option('display.max_columns', 75)
pd.set_option('display.min_rows', 10)
pd.set_option('display.width', 1000)
df = pd.read_excel(path)
date_to_predict = forecast_date
# Turning 24:00 to 00.00 and incrementing the day by 1
twenty_fours = df['TIME'].str[-5:] == '24:00'
df.loc[twenty_fours, 'TIME'] = df['TIME'].str[:-5] + '00:00'
df['TIME'] = pd.to_datetime(df['TIME'], format='%H:%M')
df.loc[twenty_fours, 'DATE'] += pd.DateOffset(1)
df['TIME'] = df['TIME'].dt.strftime('%H:%M')
df.loc[:,'DATE'] = pd.to_datetime(df.DATE.astype(str)+' '+df.TIME.astype(str))
# Equating negative solar radiation values to zero
df['POWER GENERATION (MW)'] = df['POWER GENERATION (MW)'].fillna(0)
negatives = np.where(df['SOLAR RADIATION (W/m2)'] < 0.0)
for i in range(len(negatives)):
df.loc[negatives[i],'SOLAR RADIATION (W/m2)'] = 0.0
# Setting the date as index
df['DATE'] = pd.to_datetime(df['DATE'])
df = df.set_index(['DATE'])
# Getting the hour, month, year, season and time of day data as columns
df['hour'] = df.index.hour.astype('category')
df['month'] = df.index.month.astype('category')
df['year'] = df.index.year.astype('category')
df['season'] = df['month'].apply(split_to_seasons).astype('category')
df['time_of_day'] = df['hour'].apply(time_of_day).astype('category')
# Exogenous variables for yearly, weekly and hourly for seasonality effect
df['year_sin365'] = np.sin(2 * np.pi * df.index.dayofyear / 365)
df['week_sin365'] = np.sin(2 * np.pi * df.index.dayofweek/7)
df['hour_sin365'] = np.sin(2 * np.pi * df.index.hour/24)
#min-max scaling
df_T_min, df_T_max = df['TEMPERATURE (°C)'].min(), df['TEMPERATURE (°C)'].max()
df_solar_min, df_solar_max = df['SOLAR RADIATION (W/m2)'].min(), df['SOLAR RADIATION (W/m2)'].max()
df_cloud_min, df_cloud_max = df['CLOUDNESS (%)'].min(), df['CLOUDNESS (%)'].max()
df['TEMPERATURE (°C)'] = (df['TEMPERATURE (°C)'] - df_T_min) / (df_T_max - df_T_min)
df['SOLAR RADIATION (W/m2)'] = (df['SOLAR RADIATION (W/m2)'] - df_solar_min) / (df_solar_max - df_solar_min)
df['CLOUDNESS (%)'] = (df['CLOUDNESS (%)'] - df_cloud_min) / (df_cloud_max - df_cloud_min)
# Encoding months, seasons and time of days
encoder_month=ce.OneHotEncoder(cols='month',handle_unknown='return_nan',return_df=True,use_cat_names=True)
encoder_season=ce.OneHotEncoder(cols='season',handle_unknown='return_nan',return_df=True,use_cat_names=True)
encoder_time=ce.OneHotEncoder(cols='time_of_day',handle_unknown='return_nan',return_df=True,use_cat_names=True)
df = encoder_month.fit_transform(df)
df = encoder_season.fit_transform(df)
df = encoder_time.fit_transform(df)
'''
# Uncomment this if the actual PV gen data is available for the forecast day
# Shift predicted environment values by 24hr
for i in range(24):
df['lag'+str(i+1)] = df['POWER GENERATION (MW)'].shift(i+1)
df = df.dropna()
'''
# Create the model
random_forest_df = df
y = random_forest_df['POWER GENERATION (MW)']
x = random_forest_df.drop(['POWER GENERATION (MW)'], axis=1).drop(['TIME'], axis=1).drop(['WIND SPEED (m/s)'], axis=1)
X_train, X_test, y_train, y_test = train_test_split( x, y, test_size=0.2, shuffle=False)
# Random forest model, note that parameters were found by RandomizedSearchCV (a tool for hyper tuning)
rf = RandomForestRegressor(n_estimators = 100, min_samples_split= 3, min_samples_leaf= 4, max_features= None
, max_depth= None, random_state=42)
rf.fit(X_train,y_train)
# Finding the importances of the features
features = df.drop(['POWER GENERATION (MW)'], axis=1).drop(['WIND SPEED (m/s)'], axis=1).drop(['TIME'], axis=1)
feature_list = list(features.columns)
importances = list(rf.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 6)) for feature, importance in zip(feature_list, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances
#[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];
# Prediction
rf_prediction = rf.predict(X_test)
predicted = y_test
rf_prediction = pd.Series(rf_prediction, index=predicted.index)
act_date_data = y_test[date_to_predict]
predicted_date_data = rf_prediction[date_to_predict]
# Uncomment this if you want to print error metrics
mse = mean_squared_error(y_test, rf_prediction, squared=True)
mae = mean_absolute_error(y_test, rf_prediction)
rmse = mean_squared_error(y_test, rf_prediction, squared=False)
r2 = r2_score(y_test, rf_prediction)
print("r2:", r2)
print("MSE:", mse)
print("MAE:", mae)
print("RMSE:", rmse)
plt.title('Power Generation Forecast')
plt.ylabel('Power Generation (MW)')
plt.grid(True)
plt.plot(act_date_data.index, act_date_data, label="Actual")
plt.plot(predicted_date_data.index, predicted_date_data, label="Forecast")
plt.legend()
plt.show()
return predicted_date_data
# Data file name should be specified in the path input and putting r before the path string may be needed to make the
# path viable. Forecast date should be in YEAR-MONTH-DAY format.
forecast_data = forecast(path, "date")