-
Notifications
You must be signed in to change notification settings - Fork 2
/
subscript_lineage2taxonomyTrain.py
123 lines (122 loc) · 4.35 KB
/
subscript_lineage2taxonomyTrain.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env python
#used to convert a taxonomy in tab-delimited file containing the taxonomic hierarchical structure to RDP Classifier taxonomy training file
#Approach:each taxon is uniquely identified by the combination of its tax id and depth from the root rank, its attributes comprise: name, parent taxid, and level of depth from the root rank.
import os
def lin2tax(file_base, format, dup=False):
print("\n\tTraining Taxonomy")
if dup:
print("\n\tDuplicate taxa being handled with numerical suffices")
with open(file_base+"__RDP_taxonomy.txt", 'r') as f:
line = f.readline()
cols = line.strip().split('\t')[1:] # Split the first line into columns
hash = {}#taxon name-id map
ranks = {}#column number-rank map
hash = {"Root":0}#initiate root rank taxon id map
for i in range(len(cols)): # Assign ranks based on column headers
ranks[i] = cols[i]
root = ['0', 'Root', '-1', '0', 'rootrank']#root rank info
with open(file_base+"__RDP_taxonomy_trained.txt", 'w') as output_file:
output_file.write("*".join(root)+"\n")
ID = 0 #taxon id
line = f.readline()
if format == "UNITE":
name_to_end = {}
if dup:
end_name_dict = {}
while line != "":
rec_count = 0
th_buf = "" # taxon header buffer
output_buf = "" # trained taxonomy buffer
while line != "" and rec_count < 10000: # Rec count to export when buffer is at 10000 records
rec_count += 1
acc = line.split('\t')[0]
cols = line.strip().split('\t')[1:]
header = F">{acc}\tRoot"
for i in range(len(cols)):#iterate each column
name = []
for node in cols[:i + 1]:
if not node == '-':
name.append(node)
pName = ";".join(name[:-1])
depth = len(name)
name = ";".join(name)
if name in hash: # Avoid repeated taxonomies
if name != prev_name:
prev_name = name
header += F";{name_to_end[name]}"
continue
prev_name = name
rank = ranks[i]
if i == 0:
pName = 'Root'
pID = hash[pName]#parent taxid
ID += 1
hash[name] = ID #add name-id to the map
end_name = name.split(';')[-1]
if dup:
if end_name not in end_name_dict:
end_name_dict[end_name] = 1
end_name = F"{end_name}_1"
else:
end_name_dict[end_name] += 1
end_name = F"{end_name}_{end_name_dict[end_name]}"
header = F"{header};{end_name}"
name_to_end[name] = end_name
output_buf = F"{output_buf}{ID}*{end_name}*{pID}*{depth}*{rank}\n"
th_buf = F"{th_buf}{header}\n"
line = f.readline()
with open(file_base+"__RDP_taxonomy_headers.txt", "a+") as taxon_headers:
taxon_headers.write(th_buf)
with open(file_base+"__RDP_taxonomy_trained.txt", "a+") as output_file:
output_file.write(output_buf)
else:
name_to_end = {}
end_name_dict = {}
while line != "":
rec_count = 0
th_buf = ""
output_buf = ""
while line != "" and rec_count < 10000:
rec_count += 1
acc = line.split('\t')[0]
cols = line.strip().split('\t')[1:]
header = F">{acc}\tRoot"
for i in range(len(cols)):#iterate each column
name = []
for node in cols[:i + 1]:
if not node == '-':
name.append(node)
pName = ";".join(name[:-1])
depth = len(name)
name = ";".join(name)
if name in hash:
if name != prev_name:
prev_name = name
header += F";{name_to_end[name]}"
continue
prev_name = name
rank = ranks[i]
if i == 0:
pName = 'Root'
pID = hash[pName]#parent taxid
ID += 1
hash[name] = ID #add name-id to the map
end_name = name.split(';')[-1]
# Allow for taxa which have more than 1 parent lineage
if end_name not in end_name_dict:
end_name_dict[end_name] = 1
end_name = F"{end_name}_1"
else:
end_name_dict[end_name] += 1
end_name = F"{end_name}_{end_name_dict[end_name]}"
header = F"{header};{end_name}"
name_to_end[name] = end_name
output_buf = F"{output_buf}{ID}*{end_name}*{pID}*{depth}*{rank}\n"
th_buf = F"{th_buf}{header}\n"
line = f.readline()
with open(file_base+"__RDP_taxonomy_headers.txt", "a+") as taxon_headers:
taxon_headers.write(th_buf)
print("Headers exported")
with open(file_base+"__RDP_taxonomy_trained.txt", "a+") as output_file:
output_file.write(output_buf)
print("Trained taxonomy exported")