Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
edtion1.0
  • Loading branch information
stxupengyu authored Aug 7, 2020
1 parent edd4ba7 commit 197d90a
Show file tree
Hide file tree
Showing 4 changed files with 46,356 additions and 0 deletions.
269 changes: 269 additions & 0 deletions implicit-feedback.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,269 @@


```{r}
#参数设置
rm(list=ls())
epsilon=50 # Learning rate 学习率
lambda = 0.1#Regularization parameter正则化参数
momentum=0.7#动量优化参数
epoch=1#初始化epoch
maxepoch=10#总训练次数
err_train=rep(0,maxepoch)
err_valid=rep(0,maxepoch)
err_random=rep(0,maxepoch)
num_feat = 10 #Rank 10 decomposition 隐因子数量
```

```{r}
#导入数据
data=read.csv("training.txt", header = FALSE,sep=' ')
data[,3]=5
rating=data[,3]
movie=data[,2]
user=data[,1]
data_num=length(rating)
movie_num= max(movie)#电影数量1682
user_num=max(user)#用户数量943
negative_sample=sample(1:user_num*movie_num,data_num,replace=TRUE)
#负采样 同样数量的负样本
record_user=rep(0,0)
record_movie=rep(0,0)
record_rating=rep(1,data_num)
for (i in negative_sample){
movie_i=i%/%user_num
user_i=i%%user_num+1
record_user=append(record_user,user_i)
record_movie=append(record_movie,movie_i)
}
negative_data=data.frame(V1=record_user, V2=record_movie, V3=record_rating)
```


```{r}
#融合正样本和负样本
data=rbind(data,negative_data)
#data=rbind(data,data)
#
rating=data[,3]
movie=data[,2]
user=data[,1]
data_num=length(rating)
movie_num= max(movie)#电影数量1682
user_num=max(user)#用户数量943
#打乱数据集
rr=sample(1:data_num,data_num)
data= data[rr,]
```


```{r}
data_num/(movie_num*user_num)#稀疏度
```

```{r}
#划分训练集和测试集
train_num=80000#(data_num*9)%/%10;
train_vec=data[1:train_num,];#训练集
probe_vec=data[train_num:84000,];#测试集
mean_rating = mean(train_vec[,3])#平均评级
pairs_tr = length((train_vec[,3]))#training data 训练集长度
pairs_pr = length((probe_vec[,3]))#validation data 验证集长度
numbatches= 8 #Number of batches把数据分为8份
num_m = movie_num # Number of movies 电影数量
num_p = user_num #Number of users 用户数量
```

```{r}
#初始化
w1_M1 = 0.1*matrix(runif(num_m*num_feat),nrow = num_m , ncol = num_feat ,byrow =T) # Movie feature vectors 生成用户物品特征矩阵d=10
w1_P1 = 0.1*matrix(runif(num_p*num_feat),nrow = num_p , ncol = num_feat ,byrow =T) # User feature vecators
w1_M1_inc = matrix(rep(0,num_m*num_feat),nrow = num_m , ncol = num_feat ,byrow =T)#生成同shape的全零矩阵
w1_P1_inc = matrix(rep(0,num_p*num_feat),nrow = num_p , ncol = num_feat ,byrow =T)
```

```{r}
for(epoch in epoch:maxepoch){
#采用mini batch的方法,每次训练10000个样本
for(batch in 1:numbatches){
#print(c('epoch',epoch,'batch',batch))
N=10000 #number training triplets per batch 每次训练三元组的数量
aa_p= train_vec[((batch-1)*N+1):(batch*N),1]#读取用户列每次读取一万个
aa_m= train_vec[((batch-1)*N+1):(batch*N),2]#读取电影列
rating = train_vec[((batch-1)*N+1):(batch*N),3]#读取评级列
rating = rating-mean_rating; #Default prediction is the mean rating.
# Compute Predictions %%%%%%%%%%%%%%%%%
pred_out= apply(w1_M1[aa_m,]*w1_P1[aa_p,],1,sum)#每一行进行求和,.*为对应元素相乘 10k*1,用隐特征矩阵相乘得到1w个预测评级
f=sum((pred_out - rating)^2)
#求出损失函数
#其实之所以这样计算是为了降低计算量,如果每一次都uv相乘,计算量太大
#Compute Gradients %%%%%%%%%%%%%%%%%%%迭代
IO =2*(pred_out - rating)
kkkk=num_feat-1
for(kkk in 1:kkkk){
IO =cbind(IO,2*(pred_out - rating))
}
#IO = repmat(2*(pred_out - rating),1,num_feat)
#将损失矩阵的二倍,复制3列
Ix_m=IO*w1_P1[aa_p,] #损失*U-lambda*V 就是更新规则
Ix_p=IO*w1_M1[aa_m,] #损失*V-lambda*U
#还是不太懂他为啥能batch后这么计算 而且矩阵形式本来就更复杂
dw1_M1 = matrix(rep(0,num_m*num_feat),nrow = num_m , ncol = num_feat ,byrow =T)
#生成全零movie特征矩阵
dw1_P1 = matrix(rep(0,num_p*num_feat),nrow = num_p , ncol = num_feat ,byrow =T)#生成全零用户特征矩阵
for(ii in 1:N){#迭代一万次 每一行一行来 得到更新矩阵
dw1_M1[aa_m[ii],]= dw1_M1[aa_m[ii],] + Ix_m[ii,]
dw1_P1[aa_p[ii],]= dw1_P1[aa_p[ii],] + Ix_p[ii,]
}
# Update movie and user features %%%%%%%%%%%
#真正开始更新 新的矩阵=过去矩阵+学习率*导数
#全零特征矩阵=上次的得到的矩阵*动量+学习率*导数/1w
w1_M1_inc = momentum*w1_M1_inc + epsilon*dw1_M1/N
w1_M1 = w1_M1 - w1_M1_inc#原矩阵-负导数*学习率
w1_P1_inc = momentum*w1_P1_inc + epsilon*dw1_P1/N
w1_P1 = w1_P1 - w1_P1_inc
}
#此时所有(9轮)batch结束
#现在已经得到了此轮epoch后的一组U和V
# Compute Predictions after Paramete Updates %%%%%%%%%%%%%%%%%
pred_out= apply(w1_M1[aa_m,]*w1_P1[aa_p,],1,sum)
f_s=sum((pred_out - rating)^2 )
err_train[epoch] = sqrt(f_s/N)
#Compute predictions on the validation set %%%%%%%%%%%%%%%%%%%%%%
NN=pairs_pr#验证集长度
aa_p = probe_vec[,1]#读取验证集的user、movie、rating
aa_m = probe_vec[,2]
rating = probe_vec[,3]
pred_out =apply(w1_M1[aa_m,]*w1_P1[aa_p,],1,sum) + mean_rating#预测结果加上mean才行
pred_out[pred_out>5]=5
pred_out[pred_out<1]=1
#使得预测结果超过评分区间的值依旧掉在区间内
err_valid[epoch]= sqrt(sum((pred_out- rating)^2)/NN)
print(paste('epoch',epoch,'Train RMSE',signif(err_train[epoch], 4),'Test RMSE',signif(err_valid[epoch], 4)))
################################输出到屏幕
# err_random[epoch]=sqrt(sum((runif(10001,1,5) - rating)^2)/NN)
}
```



```{r}
#画出迭代的损失函数图
err_random=sqrt(sum((runif(10001,1,5) - rating)^2)/NN)
plot(err_train,type="l",col=4,lwd=4,main="Loss",xlab="epoch",ylab="Loss")
lines(err_valid,type="l",col=3,lwd=3)
#lines(err_random-1,type="l",col=2,lwd=2)
abline(h=err_random,col=2,lwd=2)
legend("topright",c("train","test","baseline"),lty=1,col=c(4,3,2),lwd=c(4,3,2))
```

```{r}
#展示一下预测和真实的对比
gap=40:50
rbind (rating[gap],pred_out[gap])
```


```{r}
# TOP-N 推荐
#输出要为用户i推荐的j部电影 及其预测评分
i=5
j=6
user_i_rating=w1_P1[i,]%*%t(w1_M1)+mean_rating
used=data[data$V1==i,2]
user_i_rating[used]=0
order(user_i_rating,decreasing=TRUE)[1:j]
user_i_rating[order(user_i_rating,decreasing=TRUE)[1:j]]
```

```{r}
#计算Pre@10 Re@10 其中实际评分大于4的,我们认为是需要推荐的物品
# TOP-N 推荐
j=10
txt=numeric(user_num)
for(i in 1:user_num){
user_i_rating_real=probe_vec[(probe_vec$V1==i),]
user_i_rating_real=user_i_rating_real[order(user_i_rating_real$V3,decreasing = TRUE),]
user_i_rating=w1_P1[i,]%*%t(w1_M1[user_i_rating_real$V2,])+mean_rating
if(length(user_i_rating_real$V2)>j){
txt[i]=sum(rank(-1*user_i_rating)[1:j]<=j)/j
}
if(length(user_i_rating_real$V2)<=j){
ti=sum(user_i_rating_real$V3>=4)
if(ti!=0){
txt[i]=sum(user_i_rating[1:ti]>=4)/ti
}
}
}
Pre=mean(txt)
txt=numeric(user_num)
for(i in 1:user_num){
user_i_rating_real=probe_vec[(probe_vec$V1==i),]
user_i_rating_real=user_i_rating_real[order(user_i_rating_real$V3,decreasing = TRUE),]
user_i_rating=w1_P1[i,]%*%t(w1_M1[user_i_rating_real$V2,])+mean_rating
if(length(user_i_rating_real$V2)>j){
user_i_rating[user_i_rating<4]=0
user_i_rating[user_i_rating>4]=1
ti=sum(user_i_rating)
if(ti!=0){
bigerthan4=sum(user_i_rating_real$V3>=4)
tinri=sum(user_i_rating[1:bigerthan4])
txt[i]=tinri/ti
}
}
if(length(user_i_rating_real$V2)<=j){
ti=sum(user_i_rating_real$V3>=4)
if(ti!=0){
txt[i]=sum(user_i_rating[1:ti]>=4)/ti
}
}
}
Re=mean(txt)
print(paste('Re',signif(Re, 4),'Pre',signif(Pre, 4)))
```

```{r}
#在新样本上进行预测
test=read.csv("test.txt", header = FALSE,sep=' ')
# 读取test中的用户 一一进行top10推荐
j=10
record=rep(0,0)
for(i in test$V1){
user_i_rating=w1_P1[i,]%*%t(w1_M1)+mean_rating
used=data[data$V1==i,2]
user_i_rating[used]=0
record=rbind(record,order(user_i_rating,decreasing=TRUE)[1:j])
}
final_result=cbind(test,data.frame(record))
write.csv(final_result,file = 'result.csv')
```

Loading

0 comments on commit 197d90a

Please sign in to comment.