-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmatrix_factorization.py
144 lines (120 loc) · 5.32 KB
/
matrix_factorization.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
from utils import *
from scipy.linalg import sqrtm
import numpy as np
def svd_reconstruct(matrix, k):
""" Given the matrix, perform singular value decomposition
to reconstruct the matrix.
:param matrix: 2D sparse matrix
:param k: int
:return: 2D matrix
"""
# First, you need to fill in the missing values (NaN) to perform SVD.
# Fill in the missing values using the average on the current item.
# Note that there are many options to do fill in the
# missing values (e.g. fill with 0).
new_matrix = matrix.copy()
mask = np.isnan(new_matrix)
masked_matrix = np.ma.masked_array(new_matrix, mask)
item_means = np.mean(masked_matrix, axis=0)
new_matrix = masked_matrix.filled(item_means)
# Next, compute the average and subtract it.
item_means = np.mean(new_matrix, axis=0)
mu = np.tile(item_means, (new_matrix.shape[0], 1))
new_matrix = new_matrix - mu
# Perform SVD.
Q, s, Ut = np.linalg.svd(new_matrix, full_matrices=False)
s = np.diag(s)
# Choose top k eigenvalues.
s = s[0:k, 0:k]
Q = Q[:, 0:k]
Ut = Ut[0:k, :]
s_root = sqrtm(s)
# Reconstruct the matrix.
reconst_matrix = np.dot(np.dot(Q, s_root), np.dot(s_root, Ut))
reconst_matrix = reconst_matrix + mu
return np.array(reconst_matrix)
def squared_error_loss(data, u, z):
""" Return the squared-error-loss given the data.
:param data: A dictionary {user_id: list, question_id: list,
is_correct: list}
:param u: 2D matrix
:param z: 2D matrix
:return: float
"""
loss = 0
for i, q in enumerate(data["question_id"]):
loss += (data["is_correct"][i]
- np.sum(u[data["user_id"][i]] * z[q])) ** 2.
return 0.5 * loss
def update_u_z(train_data, lr, u, z):
""" Return the updated U and Z after applying
stochastic gradient descent for matrix completion.
:param train_data: A dictionary {user_id: list, question_id: list,
is_correct: list}
:param lr: float
:param u: 2D matrix
:param z: 2D matrix
:return: (u, z)
"""
#####################################################################
# TODO: #
# Implement the function as described in the docstring. #
#####################################################################
# Randomly select a pair (user_id, question_id).
i = \
np.random.choice(len(train_data["question_id"]), 1)[0]
c = train_data["is_correct"][i]
n = train_data["user_id"][i]
q = train_data["question_id"][i]
#####################################################################
# END OF YOUR CODE #
#####################################################################
return u, z
def als(train_data, k, lr, num_iteration):
""" Performs ALS algorithm. Return reconstructed matrix.
:param train_data: A dictionary {user_id: list, question_id: list,
is_correct: list}
:param k: int
:param lr: float
:param num_iteration: int
:return: 2D reconstructed Matrix.
"""
# Initialize u and z
u = np.random.uniform(low=0, high=1 / np.sqrt(k),
size=(len(set(train_data["user_id"])), k))
z = np.random.uniform(low=0, high=1 / np.sqrt(k),
size=(len(set(train_data["question_id"])), k))
#####################################################################
# TODO: #
# Implement the function as described in the docstring. #
#####################################################################
mat = None
#####################################################################
# END OF YOUR CODE #
#####################################################################
return mat
def main():
train_matrix = load_train_sparse("../data").toarray()
train_data = load_train_csv("../data")
val_data = load_valid_csv("../data")
test_data = load_public_test_csv("../data")
#####################################################################
# TODO: #
# (SVD) Try out at least 5 different k and select the best k #
# using the validation set. #
#####################################################################
pass
#####################################################################
# END OF YOUR CODE #
#####################################################################
#####################################################################
# TODO: #
# (ALS) Try out at least 5 different k and select the best k #
# using the validation set. #
#####################################################################
pass
#####################################################################
# END OF YOUR CODE #
#####################################################################
if __name__ == "__main__":
main()