-
Notifications
You must be signed in to change notification settings - Fork 3
/
neural_network.py
234 lines (212 loc) · 8.58 KB
/
neural_network.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
"""
Create a neural network to analyse the previous data and predict points using PyTorch
"""
from dateutil import parser
from numpy import random, sqrt, stack
from pandas import read_csv
from torch.utils.data import random_split
from torch.nn import BatchNorm1d, Dropout, Embedding, Linear, Module, ModuleList, MSELoss, ReLU, Sequential
from torch.optim import SGD
from constants import DEFAULT_MODEL_PATH
import logging
import os
import torch
import constants
logger = logging.getLogger()
class Model(Module):
    """Feed-forward regressor over embedded categorical and numerical inputs.

    Every categorical column gets its own ``Embedding``. The concatenated
    embeddings go through dropout, the numerical columns go through
    ``BatchNorm1d``, and the two are concatenated and passed to a stack of
    ``Linear -> ReLU -> BatchNorm1d -> Dropout`` blocks followed by a final
    ``Linear`` layer with a single output unit.
    """

    def __init__(self, embedding_size, num_numerical_cols,
                 layer_sizes=(100, 50, 25), embedding_dropout_rate=0.1,
                 dropout_rate=0.2):
        """Build the network.

        Args:
            embedding_size: iterable of ``(num_categories, embedding_dim)``
                pairs, one per categorical column, in column order.
            num_numerical_cols: number of numerical input columns.
            layer_sizes: widths of the hidden layers, in order. May be empty,
                in which case the network is a single linear output layer.
            embedding_dropout_rate: dropout probability applied to the
                concatenated embeddings.
            dropout_rate: dropout probability applied after each hidden block.

        The defaults reproduce the original fixed architecture, and the
        submodule names/order are unchanged, so existing state dicts still load.
        """
        super().__init__()
        # Materialise once: the pairs are iterated twice below.
        embedding_size = list(embedding_size)

        self.all_embeddings = ModuleList(
            [Embedding(num_categories, dim) for num_categories, dim in embedding_size]
        )
        self.embedding_dropout = Dropout(embedding_dropout_rate)
        self.batch_norm_num = BatchNorm1d(num_numerical_cols)

        # Width of the concatenated (embeddings + numerical) input vector.
        in_features = sum(dim for _, dim in embedding_size) + num_numerical_cols

        blocks = []
        for out_features in layer_sizes:
            blocks.extend([
                Linear(in_features, out_features),
                ReLU(inplace=True),
                BatchNorm1d(out_features),
                Dropout(dropout_rate),
            ])
            in_features = out_features
        # Use the tracked width so an empty ``layer_sizes`` is also valid.
        blocks.append(Linear(in_features, 1))
        self.layers = Sequential(*blocks)

    def forward(self, x_categorical, x_numerical):
        """Compute predictions for a batch.

        Args:
            x_categorical: 2-D integer tensor; column ``i`` holds the indices
                fed to the ``i``-th embedding.
            x_numerical: 2-D float tensor with ``num_numerical_cols`` columns.

        Returns:
            Tensor with one column: the output of the final linear layer.
        """
        embedded = [
            embedding(x_categorical[:, column])
            for column, embedding in enumerate(self.all_embeddings)
        ]
        x = self.embedding_dropout(torch.cat(embedded, 1))
        x_numerical = self.batch_norm_num(x_numerical)
        return self.layers(torch.cat([x, x_numerical], 1))
# prepare and load the data
path = './data/cleaned_merged_seasons.csv'
df = read_csv(path, index_col=False, dtype={
    'season_x': 'string',
    'name': 'string',
    'position': 'string',
    'team_x': 'string',
    'assists': int,
    'bonus': int,
    'bps': int,
    'clean_sheets': int,
    'creativity': float,
    'element': int,
    'fixture': int,
    'goals_conceded': int,
    'goals_scored': int,
    'ict_index': float,
    'influence': float,
    'kickoff_time': 'string',
    'minutes': int,
    'opponent_team': int,
    'opp_team_name': 'string',
    'own_goals': int,
    'penalties_missed': int,
    'penalties_saved': int,
    'red_cards': int,
    'round': int,
    'saves': int,
    'selected': int,
    'team_a_score': float,
    'team_h_score': float,
    'threat': float,
    'total_points': int,
    'transfers_balance': int,
    'transfers_in': int,
    'transfers_out': int,
    'value': int,
    'was_home': bool,
    'yellow_cards': int,
    'GW': int
})

# remove all rows without any minutes played
# (.copy() gives an independent frame, so the column assignments below do not
# raise pandas' SettingWithCopyWarning)
df = df[df['minutes'] > 0].copy()

# convert kickoff times (ISO 8601 strings) to epoch timestamps
df['kickoff_time'] = df['kickoff_time'].apply(lambda x: parser.isoparse(x).timestamp())

# define the columns we're interested in
categorical_columns = ['name', 'opp_team_name', 'position', 'season_x', 'was_home']
numerical_columns = ['kickoff_time', 'round', 'value', 'GW']
outputs = ['total_points']

# set categorical columns to have type=category
# their integer codes (.cat.codes) are what gets fed to the embeddings
for category in categorical_columns:
    df[category] = df[category].astype('category')

# number of distinct values per categorical column
categorical_column_sizes = []
for column in categorical_columns:
    num_categories = len(df[column].cat.categories)
    # handle the case where the new season id is not in the data:
    # reserve one extra embedding row for it
    if column == 'season_x' and constants.CURRENT_SEASON not in df[column].cat.categories:
        num_categories += 1
    categorical_column_sizes.append(num_categories)

# (num_categories, embedding_dim) per column; dim is half the size, capped at 50
categorical_embedding_sizes = [(col_size, min(50, (col_size+1)//2)) for col_size in categorical_column_sizes]

# split the data roughly 80/20 into training and test data (random row mask)
msk = random.rand(len(df)) < 0.8
training_data = df[msk]
test_data = df[~msk]

# convert data into tensors of the correct format
categorical_training_data = stack([training_data[col].cat.codes.values for col in categorical_columns], 1)
categorical_training_data = torch.tensor(categorical_training_data, dtype=torch.int64)
categorical_test_data = stack([test_data[col].cat.codes.values for col in categorical_columns], 1)
categorical_test_data = torch.tensor(categorical_test_data, dtype=torch.int64)

numerical_training_data = stack([training_data[col].values for col in numerical_columns], 1)
numerical_training_data = torch.tensor(numerical_training_data, dtype=torch.float)
numerical_test_data = stack([test_data[col].values for col in numerical_columns], 1)
numerical_test_data = torch.tensor(numerical_test_data, dtype=torch.float)

training_outputs = torch.tensor(training_data[outputs].values, dtype=torch.float).flatten()
test_outputs = torch.tensor(test_data[outputs].values, dtype=torch.float).flatten()

# define the neural network model
model = Model(categorical_embedding_sizes, len(numerical_columns))
def get_player(player_name):
    """Look up the integer id of a player.

    The id is the position of ``player_name`` within
    ``df['name'].cat.categories``.

    Returns:
        The matching position, or ``None`` when the name is not a category.
    """
    known_names = df['name'].cat.categories
    matches = (code for code, name in enumerate(known_names) if name == player_name)
    return next(matches, None)
def get_team(team_name):
    """Look up the integer id of an opposition team.

    The id is the position of ``team_name`` within
    ``df['opp_team_name'].cat.categories``.

    Returns:
        The matching position, or ``None`` when the team is not a category.
    """
    known_teams = df['opp_team_name'].cat.categories
    matches = (code for code, team in enumerate(known_teams) if team == team_name)
    return next(matches, None)
def get_position(position):
    """Look up the integer id of a playing position.

    The id is the position of ``position`` within
    ``df['position'].cat.categories``.

    Returns:
        The matching index, or ``None`` when the value is not a category.
    """
    known_positions = df['position'].cat.categories
    matches = (code for code, label in enumerate(known_positions) if label == position)
    return next(matches, None)
def get_was_home(is_home):
    """Look up the integer category code for a home/away flag.

    The code is the position of ``is_home`` within
    ``df['was_home'].cat.categories``, mirroring the other ``get_*`` lookups.

    The previous implementation compared the *code* with ``is_home`` and
    returned the category *value*; that only gave a usable index when the
    categories happened to be exactly ``[False, True]``.

    Args:
        is_home: truthy/falsy flag, compared by equality with the categories.

    Returns:
        The matching integer code, or ``None`` when the flag is not a category
        (e.g. the data contains only home or only away rows).
    """
    for code, value in enumerate(df['was_home'].cat.categories):
        if value == is_home:
            return code
    return None
def get_season_id(season):
    """Look up the integer id of a season.

    The id is the position of ``season`` within
    ``df['season_x'].cat.categories``.

    Returns:
        The matching position; when the season is not a category, the number
        of known seasons (i.e. the next unused id) is returned instead.
    """
    known_seasons = list(df['season_x'].cat.categories)
    for code, label in enumerate(known_seasons):
        if label == season:
            return code
    # handle the case where the new season id is not in the data
    return len(known_seasons)
# train the model
def train_model(epochs=5000, lr=0.001, momentum=0.9):
    """Train the module-level ``model`` on the full training set.

    Every epoch is one full-batch step: forward pass over all training rows,
    MSE loss against ``training_outputs``, backward pass, SGD update. The loss
    of each epoch is logged.

    Args:
        epochs: number of full-batch optimisation steps.
        lr: SGD learning rate.
        momentum: SGD momentum.

    The defaults match the previously hard-coded values.
    """
    logger.info('Training the model. This could take a long time...')
    model.train()

    # define the optimization
    loss_function = MSELoss()
    optimizer = SGD(model.parameters(), lr=lr, momentum=momentum)

    for epoch in range(epochs):
        # The model emits one column per row; drop only that axis so the
        # prediction shape always lines up with the 1-D targets.
        predictions = model(categorical_training_data, numerical_training_data).squeeze(1)
        loss = loss_function(predictions, training_outputs)
        logger.info('Epoch: {}/{}. Loss: {:.2f}'.format(epoch+1, epochs, loss.item()))

        # clear old gradients, back-propagate, then update the weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
def test_model():
    """Evaluate the module-level ``model`` on the held-out test set.

    Runs a single forward pass in eval mode without gradient tracking and logs
    the mean squared error and its square root.
    """
    logger.info('Testing the model...')
    model.eval()
    loss_function = MSELoss()
    with torch.no_grad():
        # Drop only the output column axis so the shape matches the 1-D targets.
        predictions = model(categorical_test_data, numerical_test_data).squeeze(1)
        # Convert to a plain float so numpy and %-formatting do not depend on
        # implicit tensor coercion.
        mse = loss_function(predictions, test_outputs).item()
    logger.info('MSE: %.3f, RMSE: %.3f' % (mse, sqrt(mse)))
def predict_points(player_name, opposition_team_name, position, is_home, season, kickoff_time, round, cost, gameweek):
    """
    Use the model to make a points prediction given the input data.

    The five categorical inputs are translated to integer ids with the
    ``get_*`` helpers, in the column order player, opposition team, position,
    season, home flag. The numerical inputs are the kickoff time (ISO 8601
    string, converted to an epoch timestamp), round, cost and gameweek.

    Returns the squeezed model output for that single row.

    NOTE(review): the lookup helpers return None for unknown players, teams
    and positions; presumably the tensor construction then fails - confirm
    and validate at the caller if needed.
    """
    model.eval()
    with torch.no_grad():
        categorical_row = [
            get_player(player_name),
            get_team(opposition_team_name),
            get_position(position),
            get_season_id(season),
            get_was_home(is_home),
        ]
        categorical_data = torch.tensor([categorical_row], dtype=torch.int64)

        kickoff_timestamp = parser.isoparse(kickoff_time).timestamp()
        numerical_row = [kickoff_timestamp, round, cost, gameweek]
        numerical_data = torch.tensor([numerical_row], dtype=torch.float)

        prediction = model(categorical_data, numerical_data)
        return prediction.squeeze()
def load_model(path=DEFAULT_MODEL_PATH):
    """Restore the module-level ``model``'s weights from a state dict at ``path``.

    The location (resolved against the current working directory) is logged
    before loading.
    """
    resolved_path = os.path.join(os.getcwd(), path)
    logger.info('Loading model from {}'.format(resolved_path))
    state_dict = torch.load(path, weights_only=True)
    model.load_state_dict(state_dict)
def save_model(path=DEFAULT_MODEL_PATH):
    """Write the module-level ``model``'s state dict to ``path``.

    The location (resolved against the current working directory) is logged
    before saving.
    """
    resolved_path = os.path.join(os.getcwd(), path)
    logger.info('Saving model to {}'.format(resolved_path))
    state_dict = model.state_dict()
    torch.save(state_dict, path)