diff --git a/R/pam50_correlation_plot.R b/R/pam50_correlation_plot.R index eb33f89..56b8c57 100644 --- a/R/pam50_correlation_plot.R +++ b/R/pam50_correlation_plot.R @@ -40,7 +40,10 @@ multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) { #' @param p50 The PAM50 results text file #' @param id The ID of the samples to extract #' @param name The name to represent the samples -cor_plot = function(p50,id,name){ +cor_plot = function(p,id,name){ + print(paste0("Generating correlation plots - ",id," - ",name)) + p$X=rownames(p) + print(head(p)) #get colors #myPal <- brewer.pal(6, "Spectral") myPal = c("red","green","blue","orange","purple","black") @@ -78,11 +81,12 @@ cor_plot = function(p50,id,name){ p$X <- factor(p$X, levels = p[order(p$Call), "X"]) fm=melt(p,id.vars=c("X","Call"),measure.vars=subtypes) + print(head(fm)) #set the colours for the ticks colorOrder = myPal[as.character(p[order(p$Call),]$Call)] p2=plotter("Call") - pdf(paste0(name,".pdf")) + pdf(paste0(inputDir,"/",name,"_cor_coef_plot.pdf")) multiplot(p1,p2,cols=2) dev.off() } diff --git a/R/subtypePrediction_distributed.R b/R/subtypePrediction_distributed.R index f5ac7bd..8d97de8 100755 --- a/R/subtypePrediction_distributed.R +++ b/R/subtypePrediction_distributed.R @@ -11,14 +11,13 @@ #8. Assign the class of the most highly correlated centroid to each sample pam50_wrapper=function(inputDir,inputFile,short){ - calibrationFile<- "./Data/mediansPerDataset_v2.txt" - trainCentroids<- "./Data/pam50_centroids.txt" - trainFile<<- "./Data/220arrays_nonUBCcommon+12normal_50g.txt" + calibrationFile<- "./pam50_data/mediansPerDataset_v2.txt" + trainCentroids<- "./pam50_data/pam50_centroids.txt" + trainFile<<- "./pam50_data/220arrays_nonUBCcommon+12normal_50g.txt" proliferationGenes<-c("CCNB1","UBE2C","BIRC5","KNTC2","CDC20","PTTG1","RRM2","MKI67","TYMS","CEP55","CDCA1") stdArray<-T # just for visualization, and only set to F if many missing genes predFiles<- paste(inputDir,inputFile,sep="/") - ### # some constants ### diff --git a/R/subtypePrediction_functions.R b/R/subtypePrediction_functions.R index 6c67950..177a759 100755 --- a/R/subtypePrediction_functions.R +++ b/R/subtypePrediction_functions.R @@ -26,13 +26,13 @@ medianCtr<-function(x){ } pcaEA<-function(x,classes,size=1,showLegend=T,legendloc="topright",mainStr="",startPC=1,stopPC=2,showNames=T,showClasses=F,axisExpansion=0,groupColors=NA){ - + features<- dim(x)[1] samples<- dim(x)[2] sampleNames<- dimnames(x)[[2]] featureNames<-dimnames(x)[[1]] x<-apply(x,2,as.numeric) - + #principal components plots data.pca<-prcomp(as.matrix(x)) @@ -55,15 +55,15 @@ pcaEA<-function(x,classes,size=1,showLegend=T,legendloc="topright",mainStr="",st legendColors[k]<-k } } - + if(length(groupColors)==1){ gr.labels<-as.numeric(gr.labels) } - + #plot 2pcs by each other i<-startPC j<-stopPC - + #graphing parameters par(lab=c(3,4,3)) par(mgp=c(.3,.5,.0)) @@ -77,7 +77,7 @@ pcaEA<-function(x,classes,size=1,showLegend=T,legendloc="topright",mainStr="",st xmax<-max(data.pca$rotation[,i])+abs(axisExpansion*max(data.pca$rotation[,i])) ymin<-min(data.pca$rotation[,j])-abs(axisExpansion*min(data.pca$rotation[,j])) ymax<-max(data.pca$rotation[,j])+abs(axisExpansion*max(data.pca$rotation[,j])) - plot(data.pca$rotation[,i],data.pca$rotation[,j], xlab=strX, ylab=strY, + plot(data.pca$rotation[,i],data.pca$rotation[,j], xlab=strX, ylab=strY, main=strM, col=gr.labels,cex=size,pch="",xlim=c(xmin,xmax),ylim=c(ymin,ymax)) if(showNames){ text(data.pca$rotation[,i],data.pca$rotation[,j],labels=names(data.pca$rotation[,i]),cex=size*.6) @@ -96,26 +96,26 @@ pcaEA<-function(x,classes,size=1,showLegend=T,legendloc="topright",mainStr="",st cols <- function(lowi = "green", highi = "red", ncolors = 20) { low <- col2rgb(lowi)/255 high <- col2rgb("black")/255 - col1 <- rgb(seq(low[1], high[1], len = ncolors), seq(low[2], + col1 <- rgb(seq(low[1], high[1], len = ncolors), seq(low[2], high[2], len = ncolors), seq(low[3], high[3], len = ncolors)) low <- col2rgb("black")/255 high <- col2rgb(highi)/255 - col2 <- rgb(seq(low[1], high[1], len = ncolors), seq(low[2], + col2 <- rgb(seq(low[1], high[1], len = ncolors), seq(low[2], high[2], len = ncolors), seq(low[3], high[3], len = ncolors)) col<-c(col1[1:(ncolors-1)],col2) return(col) } - + myHeatmap<-function(x,t.colors=NA,fileName="cluster.cdt",linkage="complete",distance="pearson",contrast=2,returnSampleClust=F,rowNames=NA,rightMar=7,bottomMar=1,colNames=NA){ - + temp<-hclust2treeview(x,method=distance,file=fileName,link=linkage,keep.hclust=T) gTree<-temp[[1]] sTree<-temp[[2]] - + imageVals<-x imageVals[x > contrast] <- contrast imageVals[x < -1 * contrast] <- -1 * contrast - + if(sum(is.na(t.colors))>0){ heatmap(imageVals,Rowv=as.dendrogram(gTree),Colv=as.dendrogram(sTree), col=cols(),labCol=colNames, scale="none", @@ -165,26 +165,26 @@ collapseIDs<-function(x,allids=row.names(x),method="mean"){ ids<- levels(as.factor(allids)) x.col<- NULL - if(length(ids)==dim(x)[1]){ + if(length(ids)==dim(x)[1]){ dimnames(x)[[1]]<-allids - return(x) + return(x) } - + for(i in 1:length(ids)){ if(sum(allids==ids[i])>1){ - indices <- allids==ids[i] + indices <- allids==ids[i] if(method=="mean"){ vals<-apply(x[indices,],2,mean,na.rm=T) } if(method=="median"){ vals<-apply(x[indices,],2,median,na.rm=T) } - if(method=="stdev"){ + if(method=="stdev"){ temp<- x[indices,] stdevs<- apply(temp,1,sd,na.rm=T) vals<- temp[match(max(stdevs),stdevs),] } - if(method=="iqr"){ + if(method=="iqr"){ temp<- x[indices,] iqrs<- apply(temp,1,function(x){quantile(x,.75,na.rm=T)-quantile(x,.25,na.rm=T)}) vals<- temp[match(max(iqrs),iqrs),] @@ -197,7 +197,7 @@ collapseIDs<-function(x,allids=row.names(x),method="mean"){ dimnames(x.col)<- list(ids,dimnames(x)[[2]]) return(x.col) - + } @@ -214,11 +214,11 @@ readarray<-function(dataFile,designFile=NA,hr=1,impute=T,method="mean"){ ids<-x[,1] xd<-x[,-1] xd<-apply(xd,2,as.numeric) - xd<-collapseIDs(xd,ids,method) + xd<-collapseIDs(xd,ids,method) }else{ sampleNames<-as.vector(t(x[1,-1])) x<-x[-1,] - + classes<-x[1:(headerRows-1),] dimnames(classes)[[1]]<-classes[,1] classes<-classes[,-1] @@ -226,14 +226,14 @@ readarray<-function(dataFile,designFile=NA,hr=1,impute=T,method="mean"){ classes<-t(classes) rownames(classes)<-sampleNames classes<-as.data.frame(classes) - + xd<-x[(-1:-(headerRows-1)),] ids<-as.vector(t(xd[,1])) xd<-xd[,-1] xd<-apply(xd,2,as.numeric) xd<-collapseIDs(xd,ids,method) } - + features<- dim(xd)[1] samples<- dim(xd)[2] geneNames<-rownames(xd) @@ -249,7 +249,7 @@ readarray<-function(dataFile,designFile=NA,hr=1,impute=T,method="mean"){ x<-x[sort.list(rownames(x)),] classes<-as.data.frame(x) } - + if(sum(apply(xd,2,is.na))>0 & impute){ library(impute) allAnn<-dimnames(xd) @@ -257,7 +257,7 @@ readarray<-function(dataFile,designFile=NA,hr=1,impute=T,method="mean"){ xd<-data.imputed[1:features,] dimnames(xd)<-allAnn } - + return(list(xd=xd, classes=classes, nfeatures=features, nsamples=samples, fnames=geneNames, snames=sampleNames)) } @@ -269,26 +269,26 @@ sspPredict<-function(x,classes="",y,nGenes="",priors="equal",std=F,distm="euclid samples<- dim(x)[2] sampleNames<- dimnames(x)[[2]] featureNames<- dimnames(x)[[1]] - + #parse the test file - same as train file but no rows of classes tdataMatrix<-y tfeatures<- dim(y)[1] tsamples<- dim(y)[2] tsampleNames<- dimnames(y)[[2]] tfeatureNames<- dimnames(y)[[1]] - + #dimnames(tdataMatrix)[[2]]<-paste("x",seq(1,471)) temp <- overlapSets(dataMatrix,tdataMatrix) dataMatrix <- temp$x tdataMatrix <- temp$y sfeatureNames<-row.names(dataMatrix) - + # standardize both sets if(std){ dataMatrix<-standardize(dataMatrix) tdataMatrix<-standardize(tdataMatrix) } - + if(!centroids){ thisClass <- as.vector(classes[,1]) nClasses<-nlevels(as.factor(thisClass)) @@ -299,21 +299,21 @@ sspPredict<-function(x,classes="",y,nGenes="",priors="equal",std=F,distm="euclid thisClass<-as.numeric(thisClass) dataMatrix <- dataMatrix[,!(is.na(thisClass))] thisClass <- thisClass[!(is.na(thisClass))] - + scores<-apply(dataMatrix,1,bwss,thisClass) - trainscores<-vector() - for(j in 1:dim(dataMatrix)[1]){ + trainscores<-vector() + for(j in 1:dim(dataMatrix)[1]){ trainscores[j]<-scores[[row.names(dataMatrix)[j]]]$bss / scores[[row.names(dataMatrix)[j]]]$wss } - + dataMatrix<-dataMatrix[sort.list(trainscores,decreasing=T),] - tdataMatrix<-tdataMatrix[sort.list(trainscores,decreasing=T),] - + tdataMatrix<-tdataMatrix[sort.list(trainscores,decreasing=T),] + if(nGenes==""){ nGenes<-dim(dataMatrix)[1] } print(paste("Number of genes used:",nGenes)) - + dataMatrix<-dataMatrix[1:nGenes,] tdataMatrix<-tdataMatrix[1:nGenes,] @@ -322,7 +322,7 @@ sspPredict<-function(x,classes="",y,nGenes="",priors="equal",std=F,distm="euclid centroids[,j]<-apply(dataMatrix[,thisClass==j],1,mean) } dimnames(centroids)<-list(row.names(dataMatrix),NULL) - + }else{ nGenes<-dim(dataMatrix)[1] print(paste("Number of genes used:",nGenes)) @@ -330,7 +330,7 @@ sspPredict<-function(x,classes="",y,nGenes="",priors="equal",std=F,distm="euclid nClasses<-dim(centroids)[2] classLevels<-dimnames(centroids)[[2]] } - + distances<-matrix(ncol=nClasses,nrow=dim(tdataMatrix)[2]) for(j in 1:nClasses){ if(distm=="euclidean"){ @@ -343,14 +343,14 @@ sspPredict<-function(x,classes="",y,nGenes="",priors="equal",std=F,distm="euclid distances[,j]<-t(-1*cor(cbind(centroids[,j],tdataMatrix),method="spearman",use="pairwise.complete.obs"))[2:(dim(tdataMatrix)[2]+1)] } } - + scores<-apply(distances,1,min) prediction<-vector(length=tsamples) for(i in 1:tsamples){ prediction[i]<-classLevels[match(scores[i],distances[i,])] } names(prediction)<-tsampleNames - + return(list(predictions=prediction,testData=tdataMatrix,distances=distances,centroids=centroids)) } diff --git a/R/subtype_functions.R b/R/subtype_functions.R index ec8607a..42bf347 100644 --- a/R/subtype_functions.R +++ b/R/subtype_functions.R @@ -12,6 +12,7 @@ #installifnot("gplots") #installifnot("heatmap.plus") #installifnot("GMD") +#installifnot("reshape") #biocIfnot("limma") #biocIfnot("org.Hs.eg.db") #biocIfnot("genefu") @@ -23,6 +24,7 @@ library(gplots) library(org.Hs.eg.db) library(genefu) library(heatmap.plus) +library(reshape) #library(GMD) #################################################################### @@ -367,7 +369,7 @@ run_p50=function(){ dev.off() #plot the correlation coefficients - cor_plot(pam.res,".","Samples") + cor_plot(pam.res,".",short) } ################# SCMGENE ####################### diff --git a/Data/220arrays_nonUBCcommon+12normal_50g.txt b/pam50_data/220arrays_nonUBCcommon+12normal_50g.txt similarity index 100% rename from Data/220arrays_nonUBCcommon+12normal_50g.txt rename to pam50_data/220arrays_nonUBCcommon+12normal_50g.txt diff --git a/Data/mediansPerDataset_v2.txt b/pam50_data/mediansPerDataset_v2.txt similarity index 100% rename from Data/mediansPerDataset_v2.txt rename to pam50_data/mediansPerDataset_v2.txt diff --git a/Data/pam50_annotation.txt b/pam50_data/pam50_annotation.txt similarity index 100% rename from Data/pam50_annotation.txt rename to pam50_data/pam50_annotation.txt diff --git a/Data/pam50_centroids.txt b/pam50_data/pam50_centroids.txt similarity index 100% rename from Data/pam50_centroids.txt rename to pam50_data/pam50_centroids.txt