-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRun_PSJS.py
235 lines (208 loc) · 14.1 KB
/
Run_PSJS.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
from IGCexpansion.PSJSGeneconv import *
from IGCexpansion.JSGeneconv import *
import argparse
from collections import namedtuple
import numpy as np
def main(args):
paralog = [args.paralog1, args.paralog2]
gene_to_orlg_file = './' + '_'.join(paralog) +'_GeneToOrlg.txt'
alignment_file = './' + '_'.join(paralog) +'_Cleaned_input.fasta'
tree_newick = './input_tree.newick'
DupLosList = './EDN_ECP_DupLost.txt'
terminal_node_list = ['Tamarin', 'Macaque', 'Orangutan', 'Gorilla', 'Chimpanzee']
node_to_pos = {'D1':0}
pm_model = 'HKY'
IGC_pm = 'One rate'
seq_index_file = './' + '_'.join(paralog) +'_seq_index.txt'
if args.rate_variation:
save_file = './save/JS_HKY_'+ '_'.join(paralog) + '_' + IGC_pm.replace(' ', '_') + '_rv_nonclock_save.txt'
log_file = './log/JS_HKY_'+ '_'.join(paralog) + '_' + IGC_pm.replace(' ', '_') + '_rv_nonclock_log.txt'
summary_file = './summary/JS_HKY_'+ '_'.join(paralog) + '_' + IGC_pm.replace(' ', '_') + '_rv_nonclock_summary.txt'
x_js = np.log([ 0.5, 0.5, 0.5, 4.35588244, 0.5, 5.0, 0.3])
else:
save_file = './save/JS_HKY_'+ '_'.join(paralog) + '_' + IGC_pm.replace(' ', '_') + '_nonclock_save.txt'
log_file = './log/JS_HKY_'+ '_'.join(paralog) + '_' + IGC_pm.replace(' ', '_') + '_nonclock_log.txt'
summary_file = './summary/JS_HKY_'+ '_'.join(paralog) + '_' + IGC_pm.replace(' ', '_') + '_nonclock_summary.txt'
x_js = np.log([ 0.5, 0.5, 0.5, 4.35588244, 0.3])
test_JS = JSGeneconv(alignment_file, gene_to_orlg_file, args.cdna, tree_newick, DupLosList, x_js, pm_model, IGC_pm,
args.rate_variation, node_to_pos, terminal_node_list, save_file)
test_JS.get_mle()
#test_JS.get_expectedNumGeneconv()
test_JS.get_individual_summary(summary_file)
if args.dim == 1:
save_file = './save/PSJS_dim_1_HKY_'+ '_'.join(paralog) + '_' + IGC_pm.replace(' ', '_') + '_init_' + str(args.tract_length) + '_nonclock_save.txt'
log_file = './log/PSJS_dim_1_HKY_'+ '_'.join(paralog) + '_' + IGC_pm.replace(' ', '_') + '_init_' + str(args.tract_length) + '_nonclock_log.txt'
summary_file = './summary/PSJS_dim_1_HKY_'+ '_'.join(paralog) + '_' + IGC_pm.replace(' ', '_') + '_init_' + str(args.tract_length) + '_nonclock_summary.txt'
elif args.dim == 2:
save_file = './save/PSJS_dim_2_HKY_'+ '_'.join(paralog) + '_' + IGC_pm.replace(' ', '_') + '_init_' + str(args.tract_length) + '_nonclock_save.txt'
log_file = './log/PSJS_dim_2_HKY_'+ '_'.join(paralog) + '_' + IGC_pm.replace(' ', '_') + '_init_' + str(args.tract_length) + '_nonclock_log.txt'
summary_file = './summary/PSJS_dim_2_HKY_'+ '_'.join(paralog) + '_' + IGC_pm.replace(' ', '_') + '_init_' + str(args.tract_length) + '_nonclock_summary.txt'
else:
save_file = './save/PSJS_HKY_'+ '_'.join(paralog) + '_' + IGC_pm.replace(' ', '_') + '_init_' + str(args.tract_length) + '_nonclock_save.txt'
log_file = './log/PSJS_HKY_'+ '_'.join(paralog) + '_' + IGC_pm.replace(' ', '_') + '_init_' + str(args.tract_length) + '_nonclock_log.txt'
summary_file = './summary/PSJS_HKY_'+ '_'.join(paralog) + '_' + IGC_pm.replace(' ', '_') + '_init_' + str(args.tract_length) + '_nonclock_summary.txt'
gradient_file = './summary/PSJS_HKY_'+ '_'.join(paralog) + '_' + IGC_pm.replace(' ', '_') + '_init_' + str(args.tract_length) + '_nonclock_gradient.txt'
hessian_file = './summary/PSJS_HKY_'+ '_'.join(paralog) + '_' + IGC_pm.replace(' ', '_') + '_init_' + str(args.tract_length) + '_nonclock_hessian.txt'
godambe_file = './summary/PSJS_HKY_'+ '_'.join(paralog) + '_' + IGC_pm.replace(' ', '_') + '_init_' + str(args.tract_length) + '_nonclock_godambe.txt'
if args.rate_variation:
if args.allow_same_codon:
save_file = save_file.replace('_nonclock', '_rv_SCOK_nonclock')
log_file = log_file.replace('_nonclock', '_rv_SCOK_nonclock')
summary_file = summary_file.replace('_nonclock', '_rv_SCOK_nonclock')
gradient_file = gradient_file.replace('_nonclock', '_rv_SCOK_nonclock')
hessian_file = hessian_file.replace('_nonclock', '_rv_SCOK_nonclock')
godambe_file = godambe_file.replace('_nonclock', '_rv_SCOK_nonclock')
else:
save_file = save_file.replace('_nonclock', '_rv_NOSC_nonclock')
log_file = log_file.replace('_nonclock', '_rv_NOSC_nonclock')
summary_file = summary_file.replace('_nonclock', '_rv_NOSC_nonclock')
gradient_file = summary_file.replace('_nonclock', '_rv_NOSC_nonclock')
hessian_file = hessian_file.replace('_nonclock', '_rv_NOSC_nonclock')
godambe_file = godambe_file.replace('_nonclock', '_rv_NOSC_nonclock')
force = None
x_js = np.concatenate((test_JS.jsmodel.x_js[:-1], \
[ test_JS.jsmodel.x_js[-1] - np.log(args.tract_length), - np.log(args.tract_length) ]))
test = PSJSGeneconv(alignment_file, gene_to_orlg_file, seq_index_file, args.cdna, args.allow_same_codon, tree_newick, DupLosList, x_js, pm_model, IGC_pm,
args.rate_variation, node_to_pos, terminal_node_list, save_file, log_file, force)
## x = np.concatenate((test_JS.jsmodel.x_js[:-1], \
## [ test_JS.jsmodel.x_js[-1] - np.log(args.tract_length), - np.log(args.tract_length) ],
## test_JS.x[len(test_JS.jsmodel.x_js):]))
## test.unpack_x(x)
if args.dim == 1:
test.optimize_x_IGC(dimension = 1)
elif args.dim == 2:
test.optimize_x_IGC(dimension = 2)
else:
test.get_mle()
test.get_individual_summary(summary_file)
pairwise_lnL_summary_file = summary_file.replace('_summary.txt', '_lnL_summary.txt')
test.get_pairwise_loglikelihood_summary(pairwise_lnL_summary_file)
Godambe_x = np.array([test.psjsmodel.x_IGC[0] - test.psjsmodel.x_IGC[1], test.psjsmodel.x_IGC[1] - np.log(1.0 - np.exp(test.psjsmodel.x_IGC[1]))])
godambe = test.get_Godambe_matrix(Godambe_x)
np.savetxt(open(godambe_file, 'w+'), np.array(godambe))
test.get_gradient_hessian(Godambe_x, gradient_file, hessian_file)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--paralog1', required = True, help = 'Name of the 1st paralog')
parser.add_argument('--paralog2', required = True, help = 'Name of the 2nd paralog')
parser.add_argument('--L', type = float, dest = 'tract_length', default = 30.0, help = 'Initial guess tract length')
parser.add_argument('--D', type = int, dest = 'dim', default = 0, help = 'Dimension used in search with default value 0')
parser.add_argument('--heterogeneity', dest = 'rate_variation', action = 'store_true', help = 'rate heterogeneity control')
parser.add_argument('--homogeneity', dest = 'rate_variation', action = 'store_false', help = 'rate heterogeneity control')
parser.add_argument('--coding', dest = 'cdna', action = 'store_true', help = 'coding sequence control')
parser.add_argument('--noncoding', dest = 'cdna', action = 'store_false', help = 'coding sequence control')
parser.add_argument('--samecodon', dest = 'allow_same_codon', action = 'store_true', help = 'whether allow pair sites from same codon')
parser.add_argument('--no-samecodon', dest = 'allow_same_codon', action = 'store_false', help = 'whether allow pair sites from same codon')
main(parser.parse_args())
## MyStruct = namedtuple('MyStruct', 'paralog1 paralog2 tract_length dim cdna rate_variation allow_same_codon')
## args = MyStruct(paralog1 = 'EDN', paralog2 = 'ECP', tract_length = 30.0, dim = 1, cdna = True, rate_variation = True, allow_same_codon = True)
##
## paralog = [args.paralog1, args.paralog2]
##
## gene_to_orlg_file = './' + '_'.join(paralog) +'_GeneToOrlg.txt'
## alignment_file = './' + '_'.join(paralog) +'_Cleaned_input.fasta'
##
## tree_newick = './input_tree.newick'
## DupLosList = './EDN_ECP_DupLost.txt'
## terminal_node_list = ['Tamarin', 'Macaque', 'Orangutan', 'Gorilla', 'Chimpanzee']
## node_to_pos = {'D1':0}
## pm_model = 'HKY'
##
## IGC_pm = 'One rate'
## seq_index_file = './' + '_'.join(paralog) +'_seq_index.txt'
##
## if args.rate_variation:
## save_file = './save/JS_HKY_'+ '_'.join(paralog) + '_' + IGC_pm.replace(' ', '_') + '_rv_nonclock_save.txt'
## log_file = './log/JS_HKY_'+ '_'.join(paralog) + '_' + IGC_pm.replace(' ', '_') + '_rv_nonclock_log.txt'
## summary_file = './summary/JS_HKY_'+ '_'.join(paralog) + '_' + IGC_pm.replace(' ', '_') + '_rv_nonclock_summary.txt'
## x_js = np.log([ 0.5, 0.5, 0.5, 4.35588244, 0.5, 5.0, 0.3])
## else:
## save_file = './save/JS_HKY_'+ '_'.join(paralog) + '_' + IGC_pm.replace(' ', '_') + '_nonclock_save.txt'
## log_file = './log/JS_HKY_'+ '_'.join(paralog) + '_' + IGC_pm.replace(' ', '_') + '_nonclock_log.txt'
## summary_file = './summary/JS_HKY_'+ '_'.join(paralog) + '_' + IGC_pm.replace(' ', '_') + '_nonclock_summary.txt'
## x_js = np.log([ 0.5, 0.5, 0.5, 4.35588244, 0.3])
##
##
## test_JS = JSGeneconv(alignment_file, gene_to_orlg_file, args.cdna, tree_newick, DupLosList, x_js, pm_model, IGC_pm,
## args.rate_variation, node_to_pos, terminal_node_list, save_file, log_file = log_file)
## test_JS.get_mle()
## #test_JS.get_expectedNumGeneconv()
## test_JS.get_individual_summary(summary_file)
##
## if args.dim == 1:
## save_file = './save/PSJS_dim_1_HKY_'+ '_'.join(paralog) + '_' + IGC_pm.replace(' ', '_') + '_init_' + str(args.tract_length) + '_nonclock_save.txt'
## log_file = './log/PSJS_dim_1_HKY_'+ '_'.join(paralog) + '_' + IGC_pm.replace(' ', '_') + '_init_' + str(args.tract_length) + '_nonclock_log.txt'
## summary_file = './summary/PSJS_dim_1_HKY_'+ '_'.join(paralog) + '_' + IGC_pm.replace(' ', '_') + '_init_' + str(args.tract_length) + '_nonclock_summary.txt'
## elif args.dim == 2:
## save_file = './save/PSJS_dim_2_HKY_'+ '_'.join(paralog) + '_' + IGC_pm.replace(' ', '_') + '_init_' + str(args.tract_length) + '_nonclock_save.txt'
## log_file = './log/PSJS_dim_2_HKY_'+ '_'.join(paralog) + '_' + IGC_pm.replace(' ', '_') + '_init_' + str(args.tract_length) + '_nonclock_log.txt'
## summary_file = './summary/PSJS_dim_2_HKY_'+ '_'.join(paralog) + '_' + IGC_pm.replace(' ', '_') + '_init_' + str(args.tract_length) + '_nonclock_summary.txt'
## else:
## save_file = './save/PSJS_HKY_'+ '_'.join(paralog) + '_' + IGC_pm.replace(' ', '_') + '_init_' + str(args.tract_length) + '_nonclock_save.txt'
## log_file = './log/PSJS_HKY_'+ '_'.join(paralog) + '_' + IGC_pm.replace(' ', '_') + '_init_' + str(args.tract_length) + '_nonclock_log.txt'
## summary_file = './summary/PSJS_HKY_'+ '_'.join(paralog) + '_' + IGC_pm.replace(' ', '_') + '_init_' + str(args.tract_length) + '_nonclock_summary.txt'
##
## if args.rate_variation:
## if args.allow_same_codon:
## save_file = save_file.replace('_nonclock', '_rv_SCOK_nonclock')
## log_file = log_file.replace('_nonclock', '_rv_SCOK_nonclock')
## summary_file = summary_file.replace('_nonclock', '_rv_SCOK_nonclock')
## else:
## save_file = save_file.replace('_nonclock', '_rv_NOSC_nonclock')
## log_file = log_file.replace('_nonclock', '_rv_NOSC_nonclock')
## summary_file = summary_file.replace('_nonclock', '_rv_NOSC_nonclock')
##
##
## force = None
##
## x_js = np.concatenate((test_JS.jsmodel.x_js[:-1], \
## [ test_JS.jsmodel.x_js[-1] - np.log(args.tract_length), - np.log(args.tract_length) ]))
## test = PSJSGeneconv(alignment_file, gene_to_orlg_file, seq_index_file, args.cdna, args.allow_same_codon, tree_newick, DupLosList, x_js, pm_model, IGC_pm,
## args.rate_variation, node_to_pos, terminal_node_list, save_file, log_file, force)
#### x = np.concatenate((test_JS.jsmodel.x_js[:-1], \
#### [ test_JS.jsmodel.x_js[-1] - np.log(args.tract_length), - np.log(args.tract_length) ],
#### test_JS.x[len(test_JS.jsmodel.x_js):]))
#### test.unpack_x(x)
##
## if args.dim == 1:
## test.optimize_x_IGC(dimension = 1)
## elif args.dim == 2:
## test.optimize_x_IGC(dimension = 2)
## else:
## test.get_mle()
## test.get_individual_summary(summary_file)
##
## pairwise_lnL_summary_file = summary_file.replace('_summary.txt', '_lnL_summary.txt')
## test.get_pairwise_loglikelihood_summary(pairwise_lnL_summary_file)
## pairs = []
## all_pairs = '../Filtered_pairs.txt'
## with open(all_pairs, 'r') as f:
## for line in f.readlines():
## pairs.append(line.replace('\n','').split('_'))
##
## for pair in pairs:
## paralog = pair
##
## gene_to_orlg_file = '../GeneToOrlg/' + '_'.join(paralog) +'_GeneToOrlg.txt'
## alignment_file = '../MafftAlignment/' + '_'.join(paralog) +'/' + '_'.join(paralog) +'_input.fasta'
##
## tree_newick = '../YeastTree.newick'
## DupLosList = '../YeastTestDupLost.txt'
## terminal_node_list = ['kluyveri', 'castellii', 'bayanus', 'kudriavzevii', 'mikatae', 'paradoxus', 'cerevisiae']
## node_to_pos = {'D1':0}
## pm_model = 'HKY'
##
## IGC_pm = 'One rate'
## rate_variation = True
## cdna = True
##
## save_file = './save/Force_JS_HKY_'+ '_'.join(paralog) + '_' + IGC_pm.replace(' ', '_') + '_rv_nonclock_save.txt'
## log_file = './log/Force_JS_HKY_'+ '_'.join(paralog) + '_' + IGC_pm.replace(' ', '_') + '_rv_nonclock_log.txt'
## summary_file = './summary/Force_JS_HKY_'+ '_'.join(paralog) + '_' + IGC_pm.replace(' ', '_') + '_rv_nonclock_summary.txt'
## x_js = np.log([ 0.5, 0.5, 0.5, 4.35588244, 0.5, 5.0, 0.3])
## force = {6:0.0}
##
## test_JS = JSGeneconv(alignment_file, gene_to_orlg_file, cdna, tree_newick, DupLosList, x_js, pm_model, IGC_pm,
## rate_variation, node_to_pos, terminal_node_list, save_file, force)
## test_JS.get_mle()
## test_JS.get_individual_summary(summary_file)