-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
edtion1.0
- Loading branch information
stxupengyu
authored
Aug 7, 2020
1 parent
edd4ba7
commit 197d90a
Showing
4 changed files
with
46,356 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,269 @@ | ||
|
||
|
||
```{r} | ||
#参数设置 | ||
rm(list=ls()) | ||
epsilon=50 # Learning rate 学习率 | ||
lambda = 0.1#Regularization parameter正则化参数 | ||
momentum=0.7#动量优化参数 | ||
epoch=1#初始化epoch | ||
maxepoch=10#总训练次数 | ||
err_train=rep(0,maxepoch) | ||
err_valid=rep(0,maxepoch) | ||
err_random=rep(0,maxepoch) | ||
num_feat = 10 #Rank 10 decomposition 隐因子数量 | ||
``` | ||
|
||
```{r} | ||
#导入数据 | ||
data=read.csv("training.txt", header = FALSE,sep=' ') | ||
data[,3]=5 | ||
rating=data[,3] | ||
movie=data[,2] | ||
user=data[,1] | ||
data_num=length(rating) | ||
movie_num= max(movie)#电影数量1682 | ||
user_num=max(user)#用户数量943 | ||
negative_sample=sample(1:user_num*movie_num,data_num,replace=TRUE) | ||
#负采样 同样数量的负样本 | ||
record_user=rep(0,0) | ||
record_movie=rep(0,0) | ||
record_rating=rep(1,data_num) | ||
for (i in negative_sample){ | ||
movie_i=i%/%user_num | ||
user_i=i%%user_num+1 | ||
record_user=append(record_user,user_i) | ||
record_movie=append(record_movie,movie_i) | ||
} | ||
negative_data=data.frame(V1=record_user, V2=record_movie, V3=record_rating) | ||
``` | ||
|
||
|
||
```{r} | ||
#融合正样本和负样本 | ||
data=rbind(data,negative_data) | ||
#data=rbind(data,data) | ||
# | ||
rating=data[,3] | ||
movie=data[,2] | ||
user=data[,1] | ||
data_num=length(rating) | ||
movie_num= max(movie)#电影数量1682 | ||
user_num=max(user)#用户数量943 | ||
#打乱数据集 | ||
rr=sample(1:data_num,data_num) | ||
data= data[rr,] | ||
``` | ||
|
||
|
||
```{r} | ||
data_num/(movie_num*user_num)#稀疏度 | ||
``` | ||
|
||
```{r} | ||
#划分训练集和测试集 | ||
train_num=80000#(data_num*9)%/%10; | ||
train_vec=data[1:train_num,];#训练集 | ||
probe_vec=data[train_num:84000,];#测试集 | ||
mean_rating = mean(train_vec[,3])#平均评级 | ||
pairs_tr = length((train_vec[,3]))#training data 训练集长度 | ||
pairs_pr = length((probe_vec[,3]))#validation data 验证集长度 | ||
numbatches= 8 #Number of batches把数据分为8份 | ||
num_m = movie_num # Number of movies 电影数量 | ||
num_p = user_num #Number of users 用户数量 | ||
``` | ||
|
||
```{r} | ||
#初始化 | ||
w1_M1 = 0.1*matrix(runif(num_m*num_feat),nrow = num_m , ncol = num_feat ,byrow =T) # Movie feature vectors 生成用户物品特征矩阵d=10 | ||
w1_P1 = 0.1*matrix(runif(num_p*num_feat),nrow = num_p , ncol = num_feat ,byrow =T) # User feature vecators | ||
w1_M1_inc = matrix(rep(0,num_m*num_feat),nrow = num_m , ncol = num_feat ,byrow =T)#生成同shape的全零矩阵 | ||
w1_P1_inc = matrix(rep(0,num_p*num_feat),nrow = num_p , ncol = num_feat ,byrow =T) | ||
``` | ||
|
||
```{r} | ||
for(epoch in epoch:maxepoch){ | ||
#采用mini batch的方法,每次训练10000个样本 | ||
for(batch in 1:numbatches){ | ||
#print(c('epoch',epoch,'batch',batch)) | ||
N=10000 #number training triplets per batch 每次训练三元组的数量 | ||
aa_p= train_vec[((batch-1)*N+1):(batch*N),1]#读取用户列每次读取一万个 | ||
aa_m= train_vec[((batch-1)*N+1):(batch*N),2]#读取电影列 | ||
rating = train_vec[((batch-1)*N+1):(batch*N),3]#读取评级列 | ||
rating = rating-mean_rating; #Default prediction is the mean rating. | ||
# Compute Predictions %%%%%%%%%%%%%%%%% | ||
pred_out= apply(w1_M1[aa_m,]*w1_P1[aa_p,],1,sum)#每一行进行求和,.*为对应元素相乘 10k*1,用隐特征矩阵相乘得到1w个预测评级 | ||
f=sum((pred_out - rating)^2) | ||
#求出损失函数 | ||
#其实之所以这样计算是为了降低计算量,如果每一次都uv相乘,计算量太大 | ||
#Compute Gradients %%%%%%%%%%%%%%%%%%%迭代 | ||
IO =2*(pred_out - rating) | ||
kkkk=num_feat-1 | ||
for(kkk in 1:kkkk){ | ||
IO =cbind(IO,2*(pred_out - rating)) | ||
} | ||
#IO = repmat(2*(pred_out - rating),1,num_feat) | ||
#将损失矩阵的二倍,复制3列 | ||
Ix_m=IO*w1_P1[aa_p,] #损失*U-lambda*V 就是更新规则 | ||
Ix_p=IO*w1_M1[aa_m,] #损失*V-lambda*U | ||
#还是不太懂他为啥能batch后这么计算 而且矩阵形式本来就更复杂 | ||
dw1_M1 = matrix(rep(0,num_m*num_feat),nrow = num_m , ncol = num_feat ,byrow =T) | ||
#生成全零movie特征矩阵 | ||
dw1_P1 = matrix(rep(0,num_p*num_feat),nrow = num_p , ncol = num_feat ,byrow =T)#生成全零用户特征矩阵 | ||
for(ii in 1:N){#迭代一万次 每一行一行来 得到更新矩阵 | ||
dw1_M1[aa_m[ii],]= dw1_M1[aa_m[ii],] + Ix_m[ii,] | ||
dw1_P1[aa_p[ii],]= dw1_P1[aa_p[ii],] + Ix_p[ii,] | ||
} | ||
# Update movie and user features %%%%%%%%%%% | ||
#真正开始更新 新的矩阵=过去矩阵+学习率*导数 | ||
#全零特征矩阵=上次的得到的矩阵*动量+学习率*导数/1w | ||
w1_M1_inc = momentum*w1_M1_inc + epsilon*dw1_M1/N | ||
w1_M1 = w1_M1 - w1_M1_inc#原矩阵-负导数*学习率 | ||
w1_P1_inc = momentum*w1_P1_inc + epsilon*dw1_P1/N | ||
w1_P1 = w1_P1 - w1_P1_inc | ||
} | ||
#此时所有(9轮)batch结束 | ||
#现在已经得到了此轮epoch后的一组U和V | ||
# Compute Predictions after Paramete Updates %%%%%%%%%%%%%%%%% | ||
pred_out= apply(w1_M1[aa_m,]*w1_P1[aa_p,],1,sum) | ||
f_s=sum((pred_out - rating)^2 ) | ||
err_train[epoch] = sqrt(f_s/N) | ||
#Compute predictions on the validation set %%%%%%%%%%%%%%%%%%%%%% | ||
NN=pairs_pr#验证集长度 | ||
aa_p = probe_vec[,1]#读取验证集的user、movie、rating | ||
aa_m = probe_vec[,2] | ||
rating = probe_vec[,3] | ||
pred_out =apply(w1_M1[aa_m,]*w1_P1[aa_p,],1,sum) + mean_rating#预测结果加上mean才行 | ||
pred_out[pred_out>5]=5 | ||
pred_out[pred_out<1]=1 | ||
#使得预测结果超过评分区间的值依旧掉在区间内 | ||
err_valid[epoch]= sqrt(sum((pred_out- rating)^2)/NN) | ||
print(paste('epoch',epoch,'Train RMSE',signif(err_train[epoch], 4),'Test RMSE',signif(err_valid[epoch], 4))) | ||
################################输出到屏幕 | ||
# err_random[epoch]=sqrt(sum((runif(10001,1,5) - rating)^2)/NN) | ||
} | ||
``` | ||
|
||
|
||
|
||
```{r} | ||
#画出迭代的损失函数图 | ||
err_random=sqrt(sum((runif(10001,1,5) - rating)^2)/NN) | ||
plot(err_train,type="l",col=4,lwd=4,main="Loss",xlab="epoch",ylab="Loss") | ||
lines(err_valid,type="l",col=3,lwd=3) | ||
#lines(err_random-1,type="l",col=2,lwd=2) | ||
abline(h=err_random,col=2,lwd=2) | ||
legend("topright",c("train","test","baseline"),lty=1,col=c(4,3,2),lwd=c(4,3,2)) | ||
``` | ||
|
||
```{r} | ||
#展示一下预测和真实的对比 | ||
gap=40:50 | ||
rbind (rating[gap],pred_out[gap]) | ||
``` | ||
|
||
|
||
```{r} | ||
# TOP-N 推荐 | ||
#输出要为用户i推荐的j部电影 及其预测评分 | ||
i=5 | ||
j=6 | ||
user_i_rating=w1_P1[i,]%*%t(w1_M1)+mean_rating | ||
used=data[data$V1==i,2] | ||
user_i_rating[used]=0 | ||
order(user_i_rating,decreasing=TRUE)[1:j] | ||
user_i_rating[order(user_i_rating,decreasing=TRUE)[1:j]] | ||
``` | ||
|
||
```{r} | ||
#计算Pre@10 Re@10 其中实际评分大于4的,我们认为是需要推荐的物品 | ||
# TOP-N 推荐 | ||
j=10 | ||
txt=numeric(user_num) | ||
for(i in 1:user_num){ | ||
user_i_rating_real=probe_vec[(probe_vec$V1==i),] | ||
user_i_rating_real=user_i_rating_real[order(user_i_rating_real$V3,decreasing = TRUE),] | ||
user_i_rating=w1_P1[i,]%*%t(w1_M1[user_i_rating_real$V2,])+mean_rating | ||
if(length(user_i_rating_real$V2)>j){ | ||
txt[i]=sum(rank(-1*user_i_rating)[1:j]<=j)/j | ||
} | ||
if(length(user_i_rating_real$V2)<=j){ | ||
ti=sum(user_i_rating_real$V3>=4) | ||
if(ti!=0){ | ||
txt[i]=sum(user_i_rating[1:ti]>=4)/ti | ||
} | ||
} | ||
} | ||
Pre=mean(txt) | ||
txt=numeric(user_num) | ||
for(i in 1:user_num){ | ||
user_i_rating_real=probe_vec[(probe_vec$V1==i),] | ||
user_i_rating_real=user_i_rating_real[order(user_i_rating_real$V3,decreasing = TRUE),] | ||
user_i_rating=w1_P1[i,]%*%t(w1_M1[user_i_rating_real$V2,])+mean_rating | ||
if(length(user_i_rating_real$V2)>j){ | ||
user_i_rating[user_i_rating<4]=0 | ||
user_i_rating[user_i_rating>4]=1 | ||
ti=sum(user_i_rating) | ||
if(ti!=0){ | ||
bigerthan4=sum(user_i_rating_real$V3>=4) | ||
tinri=sum(user_i_rating[1:bigerthan4]) | ||
txt[i]=tinri/ti | ||
} | ||
} | ||
if(length(user_i_rating_real$V2)<=j){ | ||
ti=sum(user_i_rating_real$V3>=4) | ||
if(ti!=0){ | ||
txt[i]=sum(user_i_rating[1:ti]>=4)/ti | ||
} | ||
} | ||
} | ||
Re=mean(txt) | ||
print(paste('Re',signif(Re, 4),'Pre',signif(Pre, 4))) | ||
``` | ||
|
||
```{r} | ||
#在新样本上进行预测 | ||
test=read.csv("test.txt", header = FALSE,sep=' ') | ||
# 读取test中的用户 一一进行top10推荐 | ||
j=10 | ||
record=rep(0,0) | ||
for(i in test$V1){ | ||
user_i_rating=w1_P1[i,]%*%t(w1_M1)+mean_rating | ||
used=data[data$V1==i,2] | ||
user_i_rating[used]=0 | ||
record=rbind(record,order(user_i_rating,decreasing=TRUE)[1:j]) | ||
} | ||
final_result=cbind(test,data.frame(record)) | ||
write.csv(final_result,file = 'result.csv') | ||
``` | ||
|
Oops, something went wrong.