-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathvalidate_bootstrap.py
executable file
·91 lines (74 loc) · 3.48 KB
/
validate_bootstrap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/usr/bin/env python3
import commanderline.commander_line as cl
import pandas as pd
import gzip
# Lookup table from the population label found in the first column of a .Q
# file to the coarser "super-ancestry" label under which its values are summed.
# Labels that are already super-ancestries map to themselves.
ancestry_translation = {
    "ARABIAN": "NEAREAST",
    "ASHKENAZI": "ASHKENAZI-EMED",
    "BALOCHI-MAKRANI-BRAHUI": "CASIA",
    "BANTUKENYA": "EAFRICA",
    "BANTUNIGERIA": "WAFRICA",
    "BIAKA": "CAFRICA",
    "CAFRICA": "CAFRICA",
    "CAMBODIA-THAI": "SEASIA",
    "CSAMERICA": "AMERICAS",
    "CYPRUS-MALTA-SICILY": "ASHKENAZI-EMED",
    "EAFRICA": "EAFRICA",
    "EASIA": "EASIA",
    "EASTSIBERIA": "NEASIA",
    "FINNISH": "NEEUROPE",
    "GAMBIA": "WAFRICA",
    "GUJARAT": "SASIA",
    "HADZA": "EAFRICA",
    "HAZARA-UYGUR-UZBEK": "CASIA",
    "ITALY-BALKANS": "ITALY-BALKANS",
    "JAPAN-KOREA": "EASIA",
    "KALASH": "CASIA",
    "MENDE": "WAFRICA",
    "NAFRICA": "NAFRICA",
    "NCASIA": "NCASIA",
    "NEAREAST": "NEAREAST",
    "NEASIA": "NEASIA",
    "NEEUROPE": "NEEUROPE",
    "NEUROPE": "NEUROPE",
    "NGANASAN": "NEASIA",
    "OCEANIA": "OCEANIA",
    "PATHAN-SINDHI-BURUSHO": "CASIA",
    "SAFRICA": "SAFRICA",
    "SAMERICA": "AMERICAS",
    "SARDINIA": "SWEUROPE",
    "SEASIA": "SEASIA",
    "SSASIA": "SASIA",
    "SWEUROPE": "SWEUROPE",
    "TAIWAN": "SEASIA",
    "TUBALAR": "NEASIA",
    "TURK-IRAN-CAUCASUS": "TURK-IRAN-CAUCASUS",
}
def validate_bootstrap(file_list, negligible_threshold=0.01, bootstrap_confidence=0.8):
    '''
    +--------------------+
    | validate_bootstrap |
    +--------------------+
    Script that takes into account bootstrap replicates and outputs estimated "super-ancestries". Main input is a text file, where the first line is the .Q output filename of a regular Ancestry run and remaining lines are the .Q files of bootstrap replicates.

    Each .Q file is read as a space-separated table without a header: column 0 holds an ancestry label and column 1 the value that is summed per super-ancestry. Labels are translated through ancestry_translation.

    A super-ancestry from the regular run is reported only if its summed value reaches negligible_threshold in at least a bootstrap_confidence fraction of the replicates. The values of the regular run that are not reported this way are printed together as "AMBIGUOUS" (1 minus the sum of the reported values).

    Arguments:
        file_list: path of the text file listing the .Q files, one per line (blank lines are ignored).
        negligible_threshold: minimum summed value for a super-ancestry to count as present in a replicate.
        bootstrap_confidence: minimum fraction of replicates in which a super-ancestry must be present.

    Output:
        Prints a pandas Series indexed by super-ancestry, plus the "AMBIGUOUS" entry. Nothing is returned.

    Errors:
        ValueError if file_list does not name a regular run and at least one replicate.
        KeyError if a .Q file contains a label missing from ancestry_translation.

    Details about Ancestry:
    https://bitbucket.org/joepickrell/ancestry
    Package dependencies:
    pandas
    commanderline
    '''
    with open(file_list, 'rt') as f_in:
        # A blank line is not a file name; skipping it avoids handing an empty
        # path to read_csv and keeps run numbering contiguous.
        q_files = [line.strip() for line in f_in if line.strip()]

    # Without replicates the support fraction below would be 0/0, which used to
    # report everything as AMBIGUOUS without any warning.
    if len(q_files) < 2:
        raise ValueError(
            'file_list must name the regular .Q file followed by at least one '
            'bootstrap replicate .Q file; found {} file name(s) in {!r}'.format(
                len(q_files), file_list))

    per_run_totals = []
    for run_index, q_file in enumerate(q_files):
        table = pd.read_csv(q_file, header=None, sep=' ')

        unknown = table.loc[~table[0].isin(list(ancestry_translation)), 0].unique()
        if len(unknown) > 0:
            raise KeyError(
                'unknown ancestry label(s) in {!r}: {}'.format(
                    q_file, ', '.join(str(label) for label in unknown)))

        table['super'] = table[0].map(ancestry_translation)
        # Sum the value column only. Aggregating the whole frame also
        # concatenates the string labels on pandas >= 2.0 (numeric_only
        # defaults to False there), which yields an extra column.
        totals = table.groupby('super')[1].sum()
        # Run 0 is the regular run; runs 1..n are the bootstrap replicates.
        totals.name = run_index
        per_run_totals.append(totals)

    results = pd.concat(per_run_totals, axis=1)
    # Only super-ancestries present in the regular run can be reported.
    results = results.dropna(subset=[0])

    replicates = results.iloc[:, 1:]
    # A super-ancestry absent from a replicate is NaN there and compares as
    # False, i.e. it counts as "not present" in that replicate.
    support = (replicates >= negligible_threshold).sum(axis=1) / replicates.shape[1]
    confident = support >= bootstrap_confidence

    interm_results = results.loc[confident, 0]
    ambig = 1 - interm_results.sum()
    final_results = pd.concat([interm_results, pd.Series({'AMBIGUOUS': ambig})])
    print(final_results)
# When this file is executed as a script (rather than imported), hand
# validate_bootstrap to commanderline's entry point.
if __name__ == '__main__':
    cl.commander_line(validate_bootstrap)