renamed data dir to avoid issues and added correlation coeffiecient p…

…lots
elswob · Jul 9, 2015 · 91d00ec · 91d00ec
1 parent 8dc155c
commit 91d00ec
Show file tree

Hide file tree

Showing 8 changed files with 52 additions and 47 deletions.
diff --git a/R/pam50_correlation_plot.R b/R/pam50_correlation_plot.R
@@ -40,7 +40,10 @@ multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
 #' @param p50 The PAM50 results text file
 #' @param id The ID of the samples to extract
 #' @param name The name to represent the samples
-cor_plot = function(p50,id,name){
+cor_plot = function(p,id,name){
+  print(paste0("Generating correlation plots - ",id," - ",name))
+  p$X=rownames(p)
+  print(head(p))
   #get colors
   #myPal <- brewer.pal(6, "Spectral")
   myPal = c("red","green","blue","orange","purple","black")
@@ -78,11 +81,12 @@ cor_plot = function(p50,id,name){
 
   p$X <- factor(p$X, levels = p[order(p$Call), "X"])
   fm=melt(p,id.vars=c("X","Call"),measure.vars=subtypes)
+  print(head(fm))
   #set the colours for the ticks
   colorOrder = myPal[as.character(p[order(p$Call),]$Call)]
   p2=plotter("Call")
 
-  pdf(paste0(name,".pdf"))
+  pdf(paste0(inputDir,"/",name,"_cor_coef_plot.pdf"))
   multiplot(p1,p2,cols=2)
   dev.off()
 }

diff --git a/R/subtypePrediction_distributed.R b/R/subtypePrediction_distributed.R
@@ -11,14 +11,13 @@
 #8. Assign the class of the most highly correlated centroid to each sample
 
 pam50_wrapper=function(inputDir,inputFile,short){
-  calibrationFile<- "./Data/mediansPerDataset_v2.txt"
-  trainCentroids<- "./Data/pam50_centroids.txt"
-  trainFile<<- "./Data/220arrays_nonUBCcommon+12normal_50g.txt"
+  calibrationFile<- "./pam50_data/mediansPerDataset_v2.txt"
+  trainCentroids<- "./pam50_data/pam50_centroids.txt"
+  trainFile<<- "./pam50_data/220arrays_nonUBCcommon+12normal_50g.txt"
   proliferationGenes<-c("CCNB1","UBE2C","BIRC5","KNTC2","CDC20","PTTG1","RRM2","MKI67","TYMS","CEP55","CDCA1")
   stdArray<-T # just for visualization, and only set to F if many missing genes
   predFiles<- paste(inputDir,inputFile,sep="/")
 
-
   ###
   # some constants
   ###

diff --git a/R/subtypePrediction_functions.R b/R/subtypePrediction_functions.R
@@ -26,13 +26,13 @@ medianCtr<-function(x){
 }
 
 pcaEA<-function(x,classes,size=1,showLegend=T,legendloc="topright",mainStr="",startPC=1,stopPC=2,showNames=T,showClasses=F,axisExpansion=0,groupColors=NA){
-	
+
 	features<- dim(x)[1]
 	samples<- dim(x)[2]
 	sampleNames<- dimnames(x)[[2]]
 	featureNames<-dimnames(x)[[1]]
 	x<-apply(x,2,as.numeric)
-	
+
 	#principal components plots
 	data.pca<-prcomp(as.matrix(x))
 
@@ -55,15 +55,15 @@ pcaEA<-function(x,classes,size=1,showLegend=T,legendloc="topright",mainStr="",st
 				legendColors[k]<-k
 		}
 	}
-	
+
 	if(length(groupColors)==1){
 		gr.labels<-as.numeric(gr.labels)
 	}
-	
+
 	#plot 2pcs by each other
 	i<-startPC
 	j<-stopPC
-	
+
 	#graphing parameters
 	par(lab=c(3,4,3))
  	par(mgp=c(.3,.5,.0))
@@ -77,7 +77,7 @@ pcaEA<-function(x,classes,size=1,showLegend=T,legendloc="topright",mainStr="",st
 	xmax<-max(data.pca$rotation[,i])+abs(axisExpansion*max(data.pca$rotation[,i]))
 	ymin<-min(data.pca$rotation[,j])-abs(axisExpansion*min(data.pca$rotation[,j]))
 	ymax<-max(data.pca$rotation[,j])+abs(axisExpansion*max(data.pca$rotation[,j]))
-	plot(data.pca$rotation[,i],data.pca$rotation[,j], xlab=strX, ylab=strY, 
+	plot(data.pca$rotation[,i],data.pca$rotation[,j], xlab=strX, ylab=strY,
 							main=strM, col=gr.labels,cex=size,pch="",xlim=c(xmin,xmax),ylim=c(ymin,ymax))
 	if(showNames){
 		text(data.pca$rotation[,i],data.pca$rotation[,j],labels=names(data.pca$rotation[,i]),cex=size*.6)
@@ -96,26 +96,26 @@ pcaEA<-function(x,classes,size=1,showLegend=T,legendloc="topright",mainStr="",st
 cols <- function(lowi = "green", highi = "red", ncolors = 20) {
         low <- col2rgb(lowi)/255
         high <- col2rgb("black")/255
-        col1 <- rgb(seq(low[1], high[1], len = ncolors), seq(low[2], 
+        col1 <- rgb(seq(low[1], high[1], len = ncolors), seq(low[2],
             high[2], len = ncolors), seq(low[3], high[3], len = ncolors))
         low <- col2rgb("black")/255
         high <- col2rgb(highi)/255
-        col2 <- rgb(seq(low[1], high[1], len = ncolors), seq(low[2], 
+        col2 <- rgb(seq(low[1], high[1], len = ncolors), seq(low[2],
             high[2], len = ncolors), seq(low[3], high[3], len = ncolors))
         col<-c(col1[1:(ncolors-1)],col2)
         return(col)
 }
-  
+
 myHeatmap<-function(x,t.colors=NA,fileName="cluster.cdt",linkage="complete",distance="pearson",contrast=2,returnSampleClust=F,rowNames=NA,rightMar=7,bottomMar=1,colNames=NA){
-	
+
 	temp<-hclust2treeview(x,method=distance,file=fileName,link=linkage,keep.hclust=T)
 	gTree<-temp[[1]]
 	sTree<-temp[[2]]
-	
+
 	imageVals<-x
 	imageVals[x > contrast] <- contrast
 	imageVals[x < -1 * contrast] <- -1 * contrast
-	
+
 	if(sum(is.na(t.colors))>0){
 			heatmap(imageVals,Rowv=as.dendrogram(gTree),Colv=as.dendrogram(sTree),
 							col=cols(),labCol=colNames, scale="none",
@@ -165,26 +165,26 @@ collapseIDs<-function(x,allids=row.names(x),method="mean"){
         ids<- levels(as.factor(allids))
         x.col<- NULL
 
-        if(length(ids)==dim(x)[1]){ 
+        if(length(ids)==dim(x)[1]){
                         dimnames(x)[[1]]<-allids
-                        return(x) 
+                        return(x)
         }
-        
+
         for(i in 1:length(ids)){
                 if(sum(allids==ids[i])>1){
-                        indices <- allids==ids[i] 
+                        indices <- allids==ids[i]
                         if(method=="mean"){
                                 vals<-apply(x[indices,],2,mean,na.rm=T)
                         }
                         if(method=="median"){
                                 vals<-apply(x[indices,],2,median,na.rm=T)
                         }
-                        if(method=="stdev"){   
+                        if(method=="stdev"){
                                 temp<- x[indices,]
                                 stdevs<- apply(temp,1,sd,na.rm=T)
                                 vals<- temp[match(max(stdevs),stdevs),]
                         }
-                        if(method=="iqr"){   
+                        if(method=="iqr"){
                                 temp<- x[indices,]
                                 iqrs<- apply(temp,1,function(x){quantile(x,.75,na.rm=T)-quantile(x,.25,na.rm=T)})
                                 vals<- temp[match(max(iqrs),iqrs),]
@@ -197,7 +197,7 @@ collapseIDs<-function(x,allids=row.names(x),method="mean"){
 
         dimnames(x.col)<- list(ids,dimnames(x)[[2]])
         return(x.col)
-        
+
 }
 
 
@@ -214,26 +214,26 @@ readarray<-function(dataFile,designFile=NA,hr=1,impute=T,method="mean"){
 			ids<-x[,1]
 			xd<-x[,-1]
 			xd<-apply(xd,2,as.numeric)
-			xd<-collapseIDs(xd,ids,method)	
+			xd<-collapseIDs(xd,ids,method)
 	}else{
 			sampleNames<-as.vector(t(x[1,-1]))
 			x<-x[-1,]
-			
+
 			classes<-x[1:(headerRows-1),]
 			dimnames(classes)[[1]]<-classes[,1]
 			classes<-classes[,-1]
 			classes[classes==""]<-NA
 			classes<-t(classes)
 			rownames(classes)<-sampleNames
 			classes<-as.data.frame(classes)
-						
+
 			xd<-x[(-1:-(headerRows-1)),]
 			ids<-as.vector(t(xd[,1]))
 			xd<-xd[,-1]
 			xd<-apply(xd,2,as.numeric)
 			xd<-collapseIDs(xd,ids,method)
 	}
-	
+
 	features<- dim(xd)[1]
 	samples<- dim(xd)[2]
 	geneNames<-rownames(xd)
@@ -249,15 +249,15 @@ readarray<-function(dataFile,designFile=NA,hr=1,impute=T,method="mean"){
 		x<-x[sort.list(rownames(x)),]
 		classes<-as.data.frame(x)
 	}
-	
+
 	if(sum(apply(xd,2,is.na))>0 & impute){
 		library(impute)
 		allAnn<-dimnames(xd)
 		data.imputed<-impute.knn(as.matrix(xd))$data
 		xd<-data.imputed[1:features,]
 		dimnames(xd)<-allAnn
 	}
-	
+
 	return(list(xd=xd, classes=classes, nfeatures=features, nsamples=samples, fnames=geneNames, snames=sampleNames))
 }
 
@@ -269,26 +269,26 @@ sspPredict<-function(x,classes="",y,nGenes="",priors="equal",std=F,distm="euclid
 	samples<- dim(x)[2]
 	sampleNames<- dimnames(x)[[2]]
 	featureNames<- dimnames(x)[[1]]
-	
+
 	#parse the test file - same as train file but no rows of classes
 	tdataMatrix<-y
 	tfeatures<- dim(y)[1]
 	tsamples<- dim(y)[2]
 	tsampleNames<- dimnames(y)[[2]]
 	tfeatureNames<- dimnames(y)[[1]]
-	
+
 	#dimnames(tdataMatrix)[[2]]<-paste("x",seq(1,471))
 	temp <- overlapSets(dataMatrix,tdataMatrix)
 	dataMatrix <- temp$x
 	tdataMatrix <- temp$y
 	sfeatureNames<-row.names(dataMatrix)
-	
+
 	# standardize both sets
 	if(std){
 		dataMatrix<-standardize(dataMatrix)
 		tdataMatrix<-standardize(tdataMatrix)
 	}
-	
+
 	if(!centroids){
 		thisClass <- as.vector(classes[,1])
 		nClasses<-nlevels(as.factor(thisClass))
@@ -299,21 +299,21 @@ sspPredict<-function(x,classes="",y,nGenes="",priors="equal",std=F,distm="euclid
 		thisClass<-as.numeric(thisClass)
 		dataMatrix <- dataMatrix[,!(is.na(thisClass))]
 		thisClass <- thisClass[!(is.na(thisClass))]
-	
+
 		scores<-apply(dataMatrix,1,bwss,thisClass)
-		trainscores<-vector()	
-		for(j in 1:dim(dataMatrix)[1]){			
+		trainscores<-vector()
+		for(j in 1:dim(dataMatrix)[1]){
 			trainscores[j]<-scores[[row.names(dataMatrix)[j]]]$bss / scores[[row.names(dataMatrix)[j]]]$wss
 		}
-		
+
 		dataMatrix<-dataMatrix[sort.list(trainscores,decreasing=T),]
-		tdataMatrix<-tdataMatrix[sort.list(trainscores,decreasing=T),]	
-		
+		tdataMatrix<-tdataMatrix[sort.list(trainscores,decreasing=T),]
+
 		if(nGenes==""){
 			nGenes<-dim(dataMatrix)[1]
 		}
 		print(paste("Number of genes used:",nGenes))
-		
+
 		dataMatrix<-dataMatrix[1:nGenes,]
 		tdataMatrix<-tdataMatrix[1:nGenes,]
 
@@ -322,15 +322,15 @@ sspPredict<-function(x,classes="",y,nGenes="",priors="equal",std=F,distm="euclid
 			centroids[,j]<-apply(dataMatrix[,thisClass==j],1,mean)
 		}
 		dimnames(centroids)<-list(row.names(dataMatrix),NULL)
-		
+
 	}else{
 		nGenes<-dim(dataMatrix)[1]
 		print(paste("Number of genes used:",nGenes))
 		centroids<-dataMatrix
 		nClasses<-dim(centroids)[2]
 		classLevels<-dimnames(centroids)[[2]]
 	}
-	
+
 	distances<-matrix(ncol=nClasses,nrow=dim(tdataMatrix)[2])
 	for(j in 1:nClasses){
 		if(distm=="euclidean"){
@@ -343,14 +343,14 @@ sspPredict<-function(x,classes="",y,nGenes="",priors="equal",std=F,distm="euclid
 			distances[,j]<-t(-1*cor(cbind(centroids[,j],tdataMatrix),method="spearman",use="pairwise.complete.obs"))[2:(dim(tdataMatrix)[2]+1)]
 		}
 	}
-	
+
 	scores<-apply(distances,1,min)
 	prediction<-vector(length=tsamples)
 	for(i in 1:tsamples){
 		prediction[i]<-classLevels[match(scores[i],distances[i,])]
 	}
 	names(prediction)<-tsampleNames
-	
+
 	return(list(predictions=prediction,testData=tdataMatrix,distances=distances,centroids=centroids))
 }
 
diff --git a/R/subtype_functions.R b/R/subtype_functions.R
@@ -12,6 +12,7 @@
 #installifnot("gplots")
 #installifnot("heatmap.plus")
 #installifnot("GMD")
+#installifnot("reshape")
 #biocIfnot("limma")
 #biocIfnot("org.Hs.eg.db")
 #biocIfnot("genefu")
@@ -23,6 +24,7 @@ library(gplots)
 library(org.Hs.eg.db)
 library(genefu)
 library(heatmap.plus)
+library(reshape)
 #library(GMD)
 
 ####################################################################
@@ -367,7 +369,7 @@ run_p50=function(){
   dev.off()
 
   #plot the correlation coefficients
-  cor_plot(pam.res,".","Samples")
+  cor_plot(pam.res,".",short)
 }
 
 ################# SCMGENE #######################

diff --git a/Data/220arrays_nonUBCcommon+12normal_50g.txt → ...a/220arrays_nonUBCcommon+12normal_50g.txt b/Data/220arrays_nonUBCcommon+12normal_50g.txt → ...a/220arrays_nonUBCcommon+12normal_50g.txt
diff --git a/Data/mediansPerDataset_v2.txt → pam50_data/mediansPerDataset_v2.txt b/Data/mediansPerDataset_v2.txt → pam50_data/mediansPerDataset_v2.txt
diff --git a/Data/pam50_annotation.txt → pam50_data/pam50_annotation.txt b/Data/pam50_annotation.txt → pam50_data/pam50_annotation.txt
diff --git a/Data/pam50_centroids.txt → pam50_data/pam50_centroids.txt b/Data/pam50_centroids.txt → pam50_data/pam50_centroids.txt