-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrelax_parse.py
120 lines (103 loc) · 4.06 KB
/
relax_parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/env python3
###############################################
# Author: B. Anderson
# Date: Aug 2023
# Description: parse the results of a series of RELAX runs to select optimal values
# This assumes the results have already been tabulated in csv files, with a column of samples,
# and three columns per gene corresponding to K, p and LogL of the RELAX alternative model
# Arguments are the csv files (one per run; at least two), which should have the same unique taxa and genes
###############################################
import sys
import argparse
# instantiate the parser
parser = argparse.ArgumentParser(description = 'A script to parse tabulated (csv file) RELAX outputs')
# add arguments to parse
parser.add_argument('results', type = str, nargs = '*', help = 'The csv result files; these need to have ' +
'the same samples and genes in each.')
# parse the command line
if len(sys.argv[1:]) == 0: # if there are no arguments
parser.print_help(sys.stderr)
sys.exit(1)
args = parser.parse_args()
results = args.results
if len(results) < 2:
print('Please provide at least two run results files\n')
parser.print_help(sys.stderr)
sys.exit(1)
# Read in the results
gene_list = []
taxa = []
master_list = []
for fileno, result_file in enumerate(results):
with open(result_file, 'r') as infile:
for lineno, line in enumerate(infile):
parts = line.strip().split(',')
if lineno == 0:
gene_entries = parts[1: ]
if fileno == 0: # first file
for gene in gene_entries:
if gene not in gene_list:
gene_list.append(gene)
else:
continue
elif len(gene_list) != (len(gene_entries) / 3):
print('Wrong number of gene entries in ' + str(result_file))
parser.print_help(sys.stderr)
sys.exit(1)
else:
taxon = parts[0]
if fileno == 0: # first file
taxa.append(taxon)
elif taxon not in taxa:
print('Wrong taxon in ' + str(result_file))
parser.print_help(sys.stderr)
sys.exit(1)
master_list.append([taxon, parts[1: ]])
# Go through the samples in the list and choose the best values for each sample
# Determine the p-value significance and indicate it where appropriate
# Round the K value
# Count how many significant runs had K < 1 and how many had K > 1 (to assess bimodality)
out_list = []
for taxon in taxa:
top_list = []
result_list = [item[1] for item in master_list if item[0] == taxon] # grab the runs for that taxon
for index, gene in enumerate(gene_list):
if result_list[0][index * 3] == '-': # gene missing
top_list.append(['-', '-'])
else:
Kvals = [float(item[index * 3 + 0]) for item in result_list]
pvals = [float(item[index * 3 + 1]) for item in result_list]
Lvals = [float(item[index * 3 + 2]) for item in result_list]
# determine top likelihood and corresponding K and p for reporting
max_index = Lvals.index(max(Lvals))
if pvals[max_index] < 0.05:
if pvals[max_index] < 0.01:
if pvals[max_index] < 0.001:
signif = '< 0.001'
top_K = str(round(Kvals[max_index], 1)) + '***'
else:
signif = '< 0.01'
top_K = str(round(Kvals[max_index], 1)) + '**'
else:
signif = '< 0.05'
top_K = str(round(Kvals[max_index], 1)) + '*'
else:
signif = 'n.s.'
top_K = str(round(Kvals[max_index], 1))
# determine how many significant runs had divergent K values (< 1 and > 1)
sig_indices = [i for i, pval in enumerate(pvals) if pval < 0.05]
if len(sig_indices) == 0: # no significant runs
count_blurb = 'n.s.'
else:
sig_Kvals = [Kvals[i] for i in sig_indices]
count_greater = len([item for item in sig_Kvals if item >= 1])
count_less = len([item for item in sig_Kvals if item < 1])
count_blurb = str(count_less) + '/' + str(count_greater) + \
' of ' + str(count_greater + count_less)
# record
top_list.append([top_K, count_blurb])
out_list.append([taxon, top_list])
# Print the output as csv (can be directed to a file with ">")
print('Sample,' + ','.join([(item + ',' + item) for item in gene_list]))
for entry in out_list:
print(entry[0] + ',' + ','.join([','.join(item) for item in entry[1]]))