#!/usr/bin/env Rscript --vanilla
library(argparser)
library(huge)
library(MASS)
library(rjson)
library(stringr)
library(fs)
library(igraph)
source("./R/mle.getEncodingLength.r")
source("./R/mle.getPtBSbyK.r")
source("./R/data.surrogateProfiles.r")
source("./R/data.imputeData.r")
source("./R/singleNode.getNodeRanksN.r")
source("./R/graph.diffuseP1.r")
source("./R/graph.connectToExt.r")
source("./R/stat.fishersMethod.r")
p <- arg_parser("Connect The Dots - Find the most connected sub-graph")
p <- add_argument(p, "--experimental", help="Experimental dataset file name", default = '') # data/example_argininemia/experimental.csv
p <- add_argument(p, "--control", help="Control dataset file name", default = '') # data/example_argininemia/control.csv
p <- add_argument(p, "--adj_matrix", help="CSV with adjacency matrix", default = '') # data/example_argininemia/adj.csv
p <- add_argument(p, "--s_module", help="Comma-separated list or path to CSV of graph G nodes to consider when searching for the most connected sub-graph")
p <- add_argument(p, "--kmx", help="Number of highly perturbed nodes to consider. Ignored if S module is given.", default=15)
p <- add_argument(p, "--present_in_perc_for_s", help="Percentage of patients having metabolite for selection of S module. Ignored if S module is given.", default=0.5)
p <- add_argument(p, "--output_name", help="Name of the output JSON file.")
p <- add_argument(p, "--out_graph_name", help="Name of the output graph adjecancy CSV file.")
p <- add_argument(p, "--glasso_criterion", help="Graph-ical Lasso prediction of the graph criterion. stars is default, ebic is faster.", default='stars')
argv <- parse_args(p)
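# Example invocation (paths are illustrative, taken from the bundled example data):
#   Rscript CTD.r --experimental data/example_argininemia/experimental.csv \
#                 --control data/example_argininemia/control.csv \
#                 --output_name argininemia.json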
# Read input dataframe with experimental (positive, disease) samples
if (file.exists(argv$experimental)){
experimental_df <- read.csv(file = argv$experimental, check.names=FALSE, row.names=1)
experimental_df[] <- lapply(experimental_df, as.character) # TODO remove?
# Read input dataframe with control samples
control_data <- read.csv(file = argv$control, check.names=FALSE, row.names=1)
target_patients = colnames(experimental_df)
# TODO: Do we need surrogates?
# Add surrogate disease and surrogate reference profiles based on 1 standard
# deviation around profiles from real patients to improve the rank of the matrix
# when learning a Gaussian Markov Random Field network on the data. While surrogate
# profiles are not required, they tend to yield less complex networks
# (i.e., networks with fewer edges) and learn faster.
experimental_df=data.surrogateProfiles(experimental_df, 1, ref_data = control_data)
experimental_df = as.data.frame(experimental_df)
} else{
target_patients = NA
}
# Read input graph (adjacency matrix)
if (file.exists(argv$adj_matrix)){
adj_df <- read.csv(file = argv$adj_matrix, check.names=FALSE)
rownames(adj_df) = colnames(adj_df)
adj_df = as.matrix(adj_df)
} else{
experimental = huge(t(experimental_df), method="glasso")
# This will take several minutes. For a faster option, you can use the
# "ebic" criterion instead of "stars", but we recommend "stars".
message('Starting huge.select.')
experimental.select = huge.select(experimental, criterion=argv$glasso_criterion)
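# opt.icov is the selected sparse inverse-covariance (precision) matrix;
# its off-diagonal entries are used below as the edge weights of the graph.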
adj_df = as.matrix(experimental.select$opt.icov)
diag(adj_df) = 0
rownames(adj_df) = rownames(experimental_df)
colnames(adj_df) = rownames(experimental_df)
if (!is.na(argv$out_graph_name)){
write.table(adj_df, file = argv$out_graph_name, row.names=FALSE, sep=',')
}
}
message('Convert the adjacency matrix to an igraph object.')
igraph = graph.adjacency(adj_df, mode="undirected", weighted=TRUE,
add.colnames="name")
# The Encoding Process
adj_mat = as.matrix(get.adjacency(igraph, attr="weight"))
G=vector(mode="list", length=length(V(igraph)$name))
G[1:length(G)] = 0
names(G) = V(igraph)$name
## Choose node subset
kmx=argv$kmx # Number of highly perturbed nodes to consider per patient
S_set = list()
if (is.na(argv$s_module)){
for (pt in target_patients) {
sel = experimental_df[[pt]]
temp = experimental_df[order(abs(sel), decreasing=TRUE), ]
S_patient=rownames(temp)[1:kmx]
S_set<-append(S_set, S_patient)
} # S_set now contains the top kmx metabolites for every target patient
vec_s = unlist(S_set)
occurrences = as.data.frame(table(vec_s))
# Keep in the S module the metabolites perturbed in at least
# present_in_perc_for_s (default 50%) of the patients
S_perturbed_nodes_ind = which(occurrences$Freq >= (length(target_patients) * argv$present_in_perc_for_s))
S_perturbed_nodes = as.list(as.character(occurrences[S_perturbed_nodes_ind, "vec_s"]))
} else if (file.exists(argv$s_module)){
s_module_df <- read.csv(file = argv$s_module, check.names=FALSE)
S_perturbed_nodes <- as.list(as.character(s_module_df[ , ncol(s_module_df)]))
} else {
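# Illustrative format (node names are hypothetical and must match the graph's
# node names): --s_module "nodeA,nodeB,nodeC"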
S_perturbed_nodes = as.list(unlist(str_split(argv$s_module, ',')))
# TODO: Check if all given nodes are in G!
}
message(paste('Selected perturbed nodes S = ', paste(S_perturbed_nodes, collapse=',')))
## Check if all nodes from the S module are in graph
for (s_node in S_perturbed_nodes){
if (!(s_node %in% rownames(adj_mat))){
stop(paste('Node ', s_node, ' not in graph. Exiting program.'))
}
}
## Walk through all the nodes in S module
message('Get the single-node encoding node ranks starting from each node')
ranks = list()
for (n in 1:length(S_perturbed_nodes)) {
ind = which(names(G)==S_perturbed_nodes[n])
# Probability diffusion starting from the node with index ind
ranks[[n]]=singleNode.getNodeRanksN(ind,G,p1=1.0,thresholdDiff=0.01,
adj_mat,S=S_perturbed_nodes,
num.misses=log2(length(G)),TRUE)
}
names(ranks) = S_perturbed_nodes
# The list 'ranks' holds, for each node in S_perturbed_nodes, the node ranking
# obtained by diffusing probability from that node
## Convert to bitstrings
# Get the bitstrings associated with the disease module's metabolites
ptBSbyK = mle.getPtBSbyK(unlist(S_perturbed_nodes), ranks)
## Get encoding length of minimum length code word.
# experimental_df is a dataframe with disease samples (and surrogates)
# and z-scores for each metabolite
# TODO: If graph and disease module are given do we still need experimental_df?
if (file.exists(argv$experimental)){
ind = which(colnames(experimental_df) %in% target_patients)
data_mx.pvals=apply(experimental_df[,ind], c(1,2),
function(i) 2*pnorm(abs(i), lower.tail=FALSE))
# The two-sided p-value is twice the area under the standard normal curve to
# the right of |z|. pnorm uses mean = 0 and sd = 1 by default, which matches
# the scale of the z-scores.
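# e.g., |z| = 1.96 gives 2 * pnorm(1.96, lower.tail = FALSE) ~ 0.05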
}else{
data_mx.pvals = list(0)
}
# If a specific patient ID were passed, the function would also calculate
# Fisher information (fishers.Info) and varPvalue; we pass NULL for now
ptID = colnames(data_mx.pvals)[1]
res = mle.getEncodingLength(ptBSbyK, t(data_mx.pvals), NULL, G)
# Keep the result(s) with the maximum d.score, i.e. the most highly connected subset(s)
ind.mx = which(res$d.score == max(res$d.score))
highest_dscore_paths = res[ind.mx,]
## Locate encoding (F) with best d-score
# Tiebreaker 1: If several results have the same d.score, take the one with the longest bitstring
ind_F = which(nchar(highest_dscore_paths$optimalBS) ==
max(nchar(highest_dscore_paths$optimalBS)))
ind_F = highest_dscore_paths[ind_F,]
# Tiebreaker 2: Take the one with the largest subsetSize
index_highest = which(ind_F$subsetSize ==
max(ind_F$subsetSize))
ind_F = ind_F[index_highest,]
ind_F = rownames(ind_F)
F_info = res[ind_F,]
# You can interpret the probability assigned to this metabolite set by
# comparing it to a null encoding algorithm, which uses fixed-length codes
# for all metabolites in the set. The "d.score" is the difference in bit length
# between the two encodings. By the algorithmic significance theorem, we can
# estimate an upper bound on the p-value as 2^-d.score.
p_value_F = 2^-F_info$d.score
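# e.g., d.score = 10 bits gives an upper bound of 2^-10, roughly 0.001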
S_perturbed_nodes # All metabolites in S
ptBSbyK[[strtoi(ind_F)]] # all metabolites in the optimal bitstring
# F: just the metabolites in S that were "found" (bits set to 1 in the bitstring)
F_arr = ptBSbyK[[strtoi(ind_F)]]
Fs = which(F_arr == 1)
Fs = names(Fs)
message(paste('Set of highly-connected perturbed metabolites F = {', toString(Fs),
'} with p-value = ', p_value_F))
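# Probability implied by the length of the optimal bitstring: 2^(-bit length)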
kmcm_probability = 2^-nchar(F_info$optimalBS)
optimal_bitstring = F_info$optimalBS
out_dict <- list(S_perturbed_nodes = S_perturbed_nodes,
F_most_connected_nodes = Fs, p_value = p_value_F,
kmcm_probability = kmcm_probability,
optimal_bitstring = optimal_bitstring,
number_of_nodes_in_G = length(G))
res_json = toJSON(out_dict, indent = 4)
if (is.na(argv$output_name)){
if (argv$experimental == ''){
outfname = fs::path_file(argv$adj_matrix)
}else{
outfname = fs::path_file(argv$experimental)
}
outfname = str_replace(outfname, '\\.csv$', '.json')
}else{
outfname = argv$output_name
}
write(res_json, outfname)
res_ranks = toJSON(ranks)
outrname = str_replace(outfname, '\\.json$', '_ranks.json')
write(res_ranks, outrname)