"""
Sports Analytics
"""
import numeric
import codeskulptor
from urllib import request
import comp140_module6 as sports


def read_matrix(filename):
    """
    Parse data from the file with the given filename into a matrix.

    input:
        - filename: a string representing the name of the file

    returns: a matrix containing the elements in the given file
    """
    url = codeskulptor.file2url(filename)
    netfile = request.urlopen(url)
    matrix = []

    # Split each line in the file, convert the strings into decimal numbers,
    # and append the new row to the bottom of the matrix
    for line in netfile.readlines():
        strline = line.decode('utf-8')
        data = strline.split(', ')
        numdata = [float(num) for num in data]
        matrix.append(numdata)
    return numeric.Matrix(matrix)
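
# Example of the expected file layout (hypothetical numbers, not from the real
# data files): each line is a comma-plus-space separated row such as
# "89.0, 0.254, 3.91", which read_matrix converts into one row of the matrix.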


class LinearModel:
    """
    A class used to represent a linear statistical
    model of multiple variables. This model takes
    a vector of input variables and predicts that
    the measured variable will be their weighted sum.
    """

    def __init__(self, weights):
        """
        Create a new LinearModel.

        inputs:
            - weights: an m x 1 matrix of weights
        """
        self._weights = weights

    def __str__(self):
        """
        Return: the weights as a human-readable string.
        """
        return str(self._weights)

    def get_weights(self):
        """
        Return: the weights associated with the model.
        """
        return self._weights

    def generate_predictions(self, inputs):
        """
        Use this model to predict a matrix of
        measured variables given a matrix of input data.

        inputs:
            - inputs: an n x m matrix of explanatory variables

        Returns: an n x 1 matrix of predictions
        """
        return inputs @ self.get_weights()
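
    # Illustrative sketch (hypothetical numbers, assuming the numeric.Matrix API
    # used above): with weights [[2.0], [0.5]] and a single input row [[1.0, 4.0]],
    # the prediction is 1.0*2.0 + 4.0*0.5 = 4.0, i.e. one weighted sum per row,
    # collected into an n x 1 column.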

    def prediction_error(self, inputs, actual_result):
        """
        Calculate the MSE between the actual measured
        data and the predictions generated by this model
        based on the input data.

        inputs:
            - inputs: an n x m matrix of explanatory variables
            - actual_result: an n x 1 matrix of the corresponding
              actual values for the measured variables

        Returns: a float that is the MSE between the generated
        data and the actual data
        """
        total = 0.0
        expected = self.generate_predictions(inputs)
        size = actual_result.shape()
        for index in range(size[0]):
            total += (actual_result[(index, 0)] - expected[(index, 0)]) ** 2
        error = total / size[0]
        return error
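
    # Worked example (hypothetical numbers): if the model predicts [[2.0], [4.0]]
    # and the actual results are [[1.0], [5.0]], the MSE is
    # ((1.0 - 2.0)**2 + (5.0 - 4.0)**2) / 2 = 1.0.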


def fit_least_squares(input_data, output_data):
    """
    Create a LinearModel which predicts the output vector
    given the input matrix with minimal Mean-Squared Error.

    inputs:
        - input_data: an n x m matrix
        - output_data: an n x 1 matrix

    returns: a LinearModel object which has been fit to approximately
    match the data
    """
    # Calculate the weights matrix using the normal equations,
    # weights = (X^T X)^{-1} X^T y, as derived in part 3.D.i of the recipe
    product = input_data.transpose() @ input_data
    inverse = product.inverse()
    weights = (inverse @ input_data.transpose()) @ output_data
    return LinearModel(weights)
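
# Minimal usage sketch (hypothetical data, assuming the course-provided numeric
# module behaves as used above). For inputs [[1, 1], [1, 2], [1, 3]] and outputs
# [[1], [2], [3]], the normal equations give weights of approximately [[0], [1]]:
#
#   _example_inputs = numeric.Matrix([[1.0, 1.0], [1.0, 2.0], [1.0, 3.0]])
#   _example_outputs = numeric.Matrix([[1.0], [2.0], [3.0]])
#   _example_model = fit_least_squares(_example_inputs, _example_outputs)
#   print(_example_model)   # expected weights close to 0 and 1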


def soft_threshold(data, dist):
    """
    Use soft thresholding to move data closer to 0 by dist.

    inputs:
        - data: a decimal number representing the data to be moved
        - dist: a decimal number representing the distance to move the data

    returns: a decimal number representing the updated value of data after being moved
    """
    if data > dist:
        return data - dist
    elif abs(data) <= dist:
        return 0.0
    else:
        return data + dist
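
# Worked examples (hypothetical values) with dist = 1.0:
#   soft_threshold(3.0, 1.0)  ->  2.0   (shrunk toward 0 by 1.0)
#   soft_threshold(-0.5, 1.0) ->  0.0   (|data| <= dist, so it snaps to 0)
#   soft_threshold(-3.0, 1.0) -> -2.0   (shrunk toward 0 by 1.0)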


def fit_lasso(param, iterations, input_data, output_data):
    """
    Create a LinearModel which predicts the output vector
    given the input matrix using the LASSO method.

    inputs:
        - param: a float representing the lambda parameter
        - iterations: an integer representing the number of iterations
        - input_data: an n x m matrix
        - output_data: an n x 1 matrix

    returns: a LinearModel object which has been fit to approximately
    match the data
    """
    size = input_data.shape()
    weights = fit_least_squares(input_data, output_data).get_weights()
    iteration = 0

    # For each iteration, 'shoot' the current guess of the weights matrix
    # towards the minimum by iteratively making small changes to the weights
    while iteration < iterations:
        weights_old = weights.copy()
        for coord in range(size[1]):
            # Calculate the values of a_j and b_j for this coordinate
            prod1 = input_data.transpose() @ output_data
            prod2 = input_data.transpose() @ input_data
            prod3 = prod2.getrow(coord) @ weights
            a_val = (prod1[(coord, 0)] - prod3[(0, 0)]) / prod2[(coord, coord)]
            b_val = param / (2 * prod2[(coord, coord)])
            weights[(coord, 0)] = soft_threshold(weights[(coord, 0)] + a_val, b_val)

        # Stop early once the weights have essentially stopped changing
        diff_w = weights - weights_old
        sum_w = (diff_w.abs()).summation()
        if sum_w < 10 ** (-5):
            return LinearModel(weights)
        iteration += 1
    return LinearModel(weights)
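
# Sketch of the per-coordinate update implemented above (notation assumed from
# the assignment write-up, not part of the original file): with X = input_data,
# y = output_data, and w = weights, each pass updates coordinate j as
#   a_j = ((X^T y)_j - (X^T X w)_j) / (X^T X)_{jj}
#   b_j = lambda / (2 * (X^T X)_{jj})
#   w_j <- soft_threshold(w_j + a_j, b_j)
# so each weight is nudged toward its least-squares value and then shrunk toward 0.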


def run_experiment(iterations):
    """
    Using historical data from 1954-2000 as training data,
    generate weights for a LinearModel using both the
    Least-Squares method and the LASSO method (with several
    different lambda values). Test each of these models
    using the historical data from 2001-2012 as test data.

    inputs:
        - iterations: an integer representing the number of iterations to use

    Prints out each model's prediction error on the two data sets
    """
    # Read the matrices from the training and testing data files
    train_stats = read_matrix("comp140_analytics_baseball.txt")
    train_wins = read_matrix("comp140_analytics_wins.txt")
    test_stats = read_matrix("comp140_analytics_baseball_test.txt")
    test_wins = read_matrix("comp140_analytics_wins_test.txt")

    # Create and fit a model to the 1954-2000 data
    # using Least-Squares Estimation and LASSO Estimation
    lse = fit_least_squares(train_stats, train_wins)
    lasso1 = fit_lasso(1000, iterations, train_stats, train_wins)
    lasso2 = fit_lasso(10000, iterations, train_stats, train_wins)
    lasso3 = fit_lasso(100000, iterations, train_stats, train_wins)
    sports.print_weights(lasso1)
    sports.print_weights(lasso3)

    # Print out each model's prediction error on the 1954-2000 data
    print("Prediction error on the 1954-2000 data")
    print("Least-Squares Estimation: ", lse.prediction_error(train_stats, train_wins))
    print("LASSO Estimation with lambda parameter = 1000: ",
          lasso1.prediction_error(train_stats, train_wins))
    print("LASSO Estimation with lambda parameter = 10000: ",
          lasso2.prediction_error(train_stats, train_wins))
    print("LASSO Estimation with lambda parameter = 100000: ",
          lasso3.prediction_error(train_stats, train_wins))

    # Print out each model's prediction error on the 2001-2012 data
    print("Prediction error on the 2001-2012 data")
    print("Least-Squares Estimation: ", lse.prediction_error(test_stats, test_wins))
    print("LASSO Estimation with lambda parameter = 1000: ",
          lasso1.prediction_error(test_stats, test_wins))
    print("LASSO Estimation with lambda parameter = 10000: ",
          lasso2.prediction_error(test_stats, test_wins))
    print("LASSO Estimation with lambda parameter = 100000: ",
          lasso3.prediction_error(test_stats, test_wins))


# run_experiment(10)