-
Notifications
You must be signed in to change notification settings - Fork 1
/
problem2.py
153 lines (118 loc) · 5.72 KB
/
problem2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
def reduced_data_p2(data, min_videos=47):
    """Count the rows belonging to each user and keep only the active users.

    Args:
        data: Iterable of rows; the first element of every row is the user ID.
        min_videos: Minimum number of rows a user needs in order to be kept.
            The default of 47 is the original hard-coded cut-off, which the
            author described as "at least half of your quizzes".

    Returns:
        A dict mapping each retained user ID to its row count. Keys appear in
        the order the users are first seen in ``data``, so the result (and
        anything derived from its ordering) is reproducible between runs.
    """
    counts = {}
    for row in data:
        user = row[0]
        counts[user] = counts.get(user, 0) + 1
    # Filter into a new dict instead of deleting from the one being counted.
    return {user: seen for user, seen in counts.items() if seen >= min_videos}
def obtain_regression_data_p2(data, valid_users):
    """Average each valid user's per-row features and score.

    Args:
        data: Iterable of rows. Column 0 is the user ID; columns 2, 3, 5, 6,
            7, 9 and 10 hold the seven features (fracSpent, fracComp,
            fracPaused, numPauses, avgPBR, numRWs, numFFs) and column 11 holds
            the score. Each of those cells must be convertible with
            ``float``.
        valid_users: Dict mapping user ID -> number of rows for that user;
            the count is used as the divisor when averaging.

    Returns:
        A pair ``(X, y)``: ``X`` is a list with one 7-element feature list per
        user and ``y`` is the list of matching average scores. Both follow the
        key order of ``valid_users``.

    Raises:
        ValueError: If a user in ``valid_users`` has no rows in ``data``.
    """
    user_col = 0
    # fracSpent, fracComp, fracPaused, numPauses, avgPBR, numRWs, numFFs
    feature_cols = (2, 3, 5, 6, 7, 9, 10)
    score_col = 11
    value_cols = feature_cols + (score_col,)

    # One independent running-sum list per user. (dict.fromkeys(keys, [])
    # would make every user share the same list object.)
    totals = {}
    for row in data:
        user = row[user_col]
        if user not in valid_users:
            continue
        values = [float(row[col]) for col in value_cols]
        running = totals.get(user)
        if running is None:
            totals[user] = values
        else:
            for position, value in enumerate(values):
                running[position] += value

    X = []
    y = []
    for user, num_vids in valid_users.items():
        if user not in totals:
            raise ValueError(f"no rows found in data for valid user {user!r}")
        averaged = [total / num_vids for total in totals[user]]
        X.append(averaged[:-1])  # the seven features
        y.append(averaged[-1])   # the score
    return X, y
def train_test_split(X, y, n_folds=5):
    """Split ``X`` and ``y`` into contiguous cross-validation folds.

    The samples are cut, in order and without shuffling, into ``n_folds``
    consecutive test blocks. When the sample count is not divisible by
    ``n_folds`` the earliest blocks get one extra sample each. For 94 samples
    and 5 folds this gives the blocks 0-19, 19-38, 38-57, 57-76 and 76-94
    (end-exclusive), which are the slices this function used to hard-code.

    Args:
        X: Sequence of feature rows.
        y: Sequence of targets, the same length as ``X``.
        n_folds: Number of folds to produce (default 5).

    Returns:
        ``(X_train, y_train, X_test, y_test)``; each is a list with one entry
        per fold. Entry ``k`` of the test lists is block ``k`` and entry ``k``
        of the train lists is everything outside block ``k``.

    Raises:
        ValueError: If ``X`` and ``y`` differ in length, ``n_folds`` is less
            than 2, or there are fewer samples than folds.
    """
    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError("X and y must have the same number of samples")
    if n_folds < 2:
        raise ValueError("n_folds must be at least 2")
    if n_samples < n_folds:
        raise ValueError("cannot split fewer samples than folds")

    base_size, extra = divmod(n_samples, n_folds)
    X_train = []
    y_train = []
    X_test = []
    y_test = []
    start = 0
    for fold in range(n_folds):
        # Slice ends are exclusive, so `stop` is one past the last test index.
        stop = start + base_size + (1 if fold < extra else 0)
        X_train.append(list(X[:start]) + list(X[stop:]))
        y_train.append(list(y[:start]) + list(y[stop:]))
        X_test.append(list(X[start:stop]))
        y_test.append(list(y[start:stop]))
        start = stop
    return X_train, y_train, X_test, y_test
def Ind_MSE(X_train, y_train, X_test, y_test, l):
    """Fit a ridge regression on one fold and return its train and test MSE.

    Features and targets are standardised with the mean and standard
    deviation of the training data only. Predictions are mapped back to the
    original target scale before the errors are computed.

    Args:
        X_train: Training feature rows.
        y_train: Training targets.
        X_test: Test feature rows.
        y_test: Test targets.
        l: Regularisation strength, passed to ``Ridge`` as ``alpha``.

    Returns:
        ``(train_MSE, test_MSE)`` measured on the un-normalised targets.
    """
    X_train = np.asarray(X_train, dtype=float)
    y_train = np.asarray(y_train, dtype=float)
    X_test = np.asarray(X_test, dtype=float)

    mean_X = np.mean(X_train, axis=0)
    std_X = np.std(X_train, axis=0)
    mean_y = np.mean(y_train)
    std_y = np.std(y_train)

    # A constant column (or constant target) has zero deviation; dividing by
    # it would produce NaN/inf. Scale such data by 1 so it just centres to 0.
    std_X = np.where(std_X == 0, 1.0, std_X)
    if std_y == 0:
        std_y = 1.0

    X_train_norm = (X_train - mean_X) / std_X
    y_train_norm = (y_train - mean_y) / std_y
    X_test_norm = (X_test - mean_X) / std_X

    regr = Ridge(alpha=l, fit_intercept=True)
    regr.fit(X_train_norm, y_train_norm)

    # Undo the target normalisation so the MSE is in the original units.
    y_pred_train = regr.predict(X_train_norm) * std_y + mean_y
    y_pred_test = regr.predict(X_test_norm) * std_y + mean_y

    train_MSE = mean_squared_error(y_train, y_pred_train)
    test_MSE = mean_squared_error(y_test, y_pred_test)
    return train_MSE, test_MSE
def ObtainMSEs(X_train, y_train, X_test, y_test, l):
    """Average the per-fold train and test MSE for one regularisation value.

    Args:
        X_train, y_train, X_test, y_test: Lists holding one entry per fold.
        l: Regularisation strength forwarded to ``Ind_MSE``.

    Returns:
        ``(mean_train_MSE, mean_test_MSE)`` taken over all folds.
    """
    fold_scores = [
        Ind_MSE(fold_X, y_train[k], X_test[k], y_test[k], l)
        for k, fold_X in enumerate(X_train)
    ]
    train_scores = [scores[0] for scores in fold_scores]
    test_scores = [scores[1] for scores in fold_scores]
    mean_train = sum(train_scores) / len(train_scores)
    mean_test = sum(test_scores) / len(test_scores)
    return mean_train, mean_test
def Linear_Regression(X, y):
    """Sweep the ridge penalty, plot the averaged MSE curves and report the best.

    Evaluates 351 log-spaced lambda values from 10**-1 to 10**2.5 on the folds
    produced by ``train_test_split``, shows the average training and testing
    MSE in two stacked plots, then prints the lowest testing MSE and the
    lambda that produced it.

    Args:
        X: Feature rows.
        y: Targets.
    """
    lambda_range = np.logspace(-1, 2.5, num=351)

    # The folds are built once and reused for every lambda value.
    train_X, train_y, test_X, test_y = train_test_split(X, y)
    per_lambda = [
        ObtainMSEs(train_X, train_y, test_X, test_y, lam) for lam in lambda_range
    ]
    total_train_MSE = [scores[0] for scores in per_lambda]
    total_test_MSE = [scores[1] for scores in per_lambda]

    figure, axes = plt.subplots(2)
    train_ax, test_ax = axes
    figure.suptitle("Comparison of MSE")
    train_ax.plot(lambda_range, total_train_MSE)
    train_ax.set_title("Average MSE for training data")
    test_ax.plot(lambda_range, total_test_MSE)
    test_ax.set_title("Average MSE for testing data")
    test_ax.set(xlabel="Lambda Values", ylabel="Mean Squared Error")
    for axis in axes.flat:
        axis.label_outer()
    plt.show()

    best_index = np.argmin(total_test_MSE)
    best_lambda = lambda_range[best_index]
    print("Minimum testing MSE: ", min(total_test_MSE))
    print("Corresponding lambda value: ", best_lambda)
def problem2(data):
    """Run the whole pipeline: filter users, build features, fit and report.

    Args:
        data: Rows passed unchanged to ``reduced_data_p2`` and
            ``obtain_regression_data_p2``.
    """
    # Users that pass the row-count filter in reduced_data_p2.
    eligible_users = reduced_data_p2(data)
    # Per-user averaged features and scores for the regression.
    features, targets = obtain_regression_data_p2(data, eligible_users)
    Linear_Regression(features, targets)