-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathsubtypePrediction_distributed.R
executable file
·195 lines (161 loc) · 8.26 KB
/
subtypePrediction_distributed.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
# this code & protocol has been validated externally by MDACC
#1. Download normalized expression data
#2. Log transform expression estimates
#3. Optionally median center or appropriately adjust each probeset, which carries the population assumption
#4. Map probesets to Entrez gene names
#5. Format as tab delimited text, with first row of sample names and first column of gene names
#---- This software then provides the following steps
#5. For probesets that map to identical Entrez gene names, select the one with highest IQR (for Affy, select mean for Agilent)
#6. Extract the 50 genes of interest (in pam50_centroids.txt)
#7. Calculate Spearman's rank correlation between each sample and each subtype centroid (in pam50_centroids.txt)
#8. Assign the class of the most highly correlated centroid to each sample
pam50_wrapper=function(outDir,inputFile,short){
calibrationFile<- system.file("extdata","mediansPerDataset_v2.txt",package="BASALT")
trainCentroids<- system.file("extdata","pam50_centroids.txt",package="BASALT")
trainFile<<- system.file("extdata","220arrays_nonUBCcommon+12normal_50g.txt",package="BASALT")
proliferationGenes<-c("CCNB1","UBE2C","BIRC5","KNTC2","CDC20","PTTG1","RRM2","MKI67","TYMS","CEP55","CDCA1")
stdArray<-T # just for visualization, and only set to F if many missing genes
predFiles<- inputFile
###
# some constants
###
# for subtype only model
glthreshold<- -0.15
ghthreshold<- 0.1
# for subtype + proliferation model
gplthreshold<- -0.25
gphthreshold<- 0.1
# for combined model
clthreshold<- -0.1
chthreshold<- 0.2
# for combined + proliferation model
cplthreshold<- -0.2
cphthreshold<- 0.2
# begin analyses
# only need train data for visualizations
#print(paste0("reading trainFile - ",trainFile))
x<-readarray(trainFile,hr=2)
print(summary(x))
x$xd<-standardize(medianCtr(x$xd))
# load the published centroids for classifcation
pamout.centroids<-read.table(trainCentroids,sep="\t",header=T,row.names=1)
pdfname1<-paste(outDir,paste("predictionScores_pam50RankCorrelation_1_",short,".pdf",sep=""),sep="/")
pdfname2<-paste(outDir,paste("predictionScores_pam50RankCorrelation_2_",short,".pdf",sep=""),sep="/")
clustername<-paste(outDir,paste(short,"_PAM50_normalized_heatmap",sep=""),sep="/")
outFile<- paste(outDir,paste(short,"_pam50scores.txt",sep=""),sep="/")
# read in the data file
if(hasClinical){
xhr=2
}else{
xhr=1
}
y<-readarray(predFiles,hr=xhr,method=collapseMethod,impute=F)
# normalization
if(is.na(calibrationParameters)){
y$xd<-medianCtr(y$xd)
}else{
if(calibrationParameters != -1){
medians<-readarray(calibrationFile,hr=1)
print(paste("calibration to:",dimnames(medians$xd)[[2]][calibrationParameters]))
tm<-overlapSets(medians$xd,y$xd)
y$xd<-(tm$y-tm$x[,calibrationParameters])
#y$xd<-(tm$y-tm$x[,calibrationParameters])/tm$x[,15]
}
}
num.missing<- NA
if(stdArray){
y$xd<-standardize(y$xd)
}
erScore<-as.vector(t(y$xd["ESR1",]))
her2Score<-as.vector(t(y$xd["ERBB2",]))
# assign the subtype scores and calculate the proliferation score
this.proliferationGenes<-dimnames(y$xd)[[1]] %in% proliferationGenes
prolifScore<-apply(y$xd[this.proliferationGenes,],2,mean,na.rm=T)
out<-sspPredict(pamout.centroids,classes="",y$xd,std=F,distm="spearman",centroids=T)
out$distances<- -1*out$distances
call.conf<-c()
for(j in 1:length(out$predictions)){
call.conf[j]<- 1-cor.test(out$testData[,j],out$centroids[,which(colnames(pamout.centroids)==out$predictions[j])],method="spearman")$p.value
}
call.conf<-round(call.conf,2)
# calculate the risk scores
genomic <- 0.04210193*out$distances[,1] + 0.12466938*out$distances[,2] + -0.35235561*out$distances[,3] + 0.14213283*out$distances[,4]
genomicWprolif <- -0.0009299747*out$distances[,1] + 0.0692289192*out$distances[,2] + -0.0951505484*out$distances[,3] + 0.0493487685*out$distances[,4] + 0.3385116381*prolifScore
if(hasClinical){
xT<-as.numeric(as.vector(y$classes$T))
combined <- 0.0442770*out$distances[,1] + 0.1170297*out$distances[,2] + -0.2608388*out$distances[,3] + 0.1055908*out$distances[,4] + 0.1813751*xT
combinedWprolif <- -0.009383416*out$distances[,1] + 0.073725503*out$distances[,2] + -0.090436516*out$distances[,3] + 0.053013865*out$distances[,4] + 0.131605960*xT + 0.327259375*prolifScore
}
# threshold the risk score
griskgroups<-genomic
griskgroups[genomic>ghthreshold]<-"high"
griskgroups[genomic>glthreshold & genomic<ghthreshold]<-"med"
griskgroups[genomic<glthreshold]<-"low"
gpriskgroups<-genomicWprolif
gpriskgroups[genomicWprolif>gphthreshold]<-"high"
gpriskgroups[genomicWprolif>gplthreshold & genomicWprolif<gphthreshold]<-"med"
gpriskgroups[genomicWprolif<gplthreshold]<-"low"
genomic<- 100* (genomic + 0.35 ) / 0.85
genomicWprolif<- 100* (genomicWprolif + 0.35 ) / 0.85
# write output files
if(hasClinical){
criskgroups<-combined
criskgroups[combined>chthreshold]<-"high"
criskgroups[combined>clthreshold & combined<chthreshold]<-"med"
criskgroups[combined<clthreshold]<-"low"
cpriskgroups<-combinedWprolif
cpriskgroups[combinedWprolif>cphthreshold]<-"high"
cpriskgroups[combinedWprolif>cplthreshold & combinedWprolif<cphthreshold]<-"med"
cpriskgroups[combinedWprolif<cplthreshold]<-"low"
combined<- 100* (combined + 0.35 ) / 0.85
combinedWprolif<- 100* (combinedWprolif + 0.35 ) / 0.85
outtable<-cbind(out$distances, out$predictions, call.conf, genomic, griskgroups, prolifScore, genomicWprolif, gpriskgroups, combined, criskgroups, combinedWprolif, cpriskgroups, erScore, her2Score)
dimnames(outtable)[[2]]<-c("Basal","Her2","LumA","LumB","Normal","Call","Confidence",
"ROR-S (Subtype Only)","ROR-S Group (Subtype Only)","Proliferation Score",
"ROR-P (Subtype + Proliferation)","ROR-P Group (Subtype + Proliferation)",
"ROR-C (Subtype + Clinical)","ROR-C Group (Subtype + Clinical)",
"ROR-PC (Subtype + Clinical + Proliferation)","ROR-PC Group (Subtype + Clinical + Proliferation)",
"ER","Her2")
}else{
outtable<-cbind(out$distances, out$predictions, call.conf, genomic, griskgroups, prolifScore, genomicWprolif, gpriskgroups, erScore, her2Score)
dimnames(outtable)[[2]]<-c("Basal","Her2","LumA","LumB","Normal","Call","Confidence",
"ROR-S (Subtype Only)","ROR-S Group (Subtype Only)","Proliferation Score",
"ROR-P (Subtype + Proliferation)","ROR-P Group (Subtype + Proliferation)",
"ER","Her2")
}
write.table(outtable,outFile,sep="\t",col.names=NA)
# make some plots for evaluation
print(paste("ER range:",quantile(erScore,.9,na.rm=T)-quantile(erScore,.1,na.rm=T)))
subtypeColors<-out$predictions
subtypeColors[subtypeColors=="Basal"]<-"red"
subtypeColors[subtypeColors=="Her2"]<-"hotpink"
subtypeColors[subtypeColors=="LumA"]<-"darkblue"
subtypeColors[subtypeColors=="LumB"]<-"skyblue"
subtypeColors[subtypeColors=="Normal"]<-"green"
conf.colors<-call.conf
conf.colors[call.conf>=0.95]<-"black"
conf.colors[call.conf<0.95]<-"red"
pdf(paste(clustername,".pdf",sep=""))
myHeatmap(out$testData,cbind(subtypeColors,conf.colors),file=paste(clustername,".cdt",sep=""),rowNames=rownames(out$testData))
dev.off()
pdf(pdfname1,height=10,width=12)
pars<-par(no.readonly=T)
myplot(out,short,prolifScore)
dev.off()
tm<-overlapSets(x$xd,y$xd)
tm$x<-tm$x[,!is.na(x$classes$subtype)]
tm<-cbind(tm$x, impute.knn(as.matrix(tm$y))$data)
classes<-matrix(nrow=4,ncol=dim(tm)[2])
nTrainSamples<-length(x$classes$subtype[!is.na(x$classes$subtype)])
classes[1,]<-c(rep("train",nTrainSamples),rep(short,dim(y$xd)[2]))
classes[2,]<-c(x$classes$subtype[!is.na(x$classes$subtype)],rep(NA,dim(tm)[2]-dim(x$xd[,!is.na(x$classes$subtype)])[2]))
classes[3,]<-c(rep(NA,dim(tm)[2]-length(out$predictions)),out$predictions)
pdf(pdfname2,height=6,width=12)
tm<-scale(tm,center=F)
par(mfrow=c(1,3))
pcaEA(tm,classes[1,],mainStr="Traing and Test sets",showNames=F,showClasses=F)
pcaEA(tm[,!is.na(as.vector(t(classes[2,])))],classes[2,!is.na(as.vector(t(classes[2,])))],mainStr="Training cases",showNames=F,showClasses=F,groupColors=c("red","hotpink","darkblue","skyblue","green"))
pcaEA(tm[,!is.na(as.vector(t(classes[3,])))],classes[3,!is.na(as.vector(t(classes[3,])))],mainStr="Test cases",showNames=F,showClasses=F,groupColors=c("red","hotpink","darkblue","skyblue","green"))
par(pars)
dev.off()
}