-
Notifications
You must be signed in to change notification settings - Fork 2
/
FormatRefDB.py
163 lines (134 loc) · 5.82 KB
/
FormatRefDB.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#!/usr/bin/env python
# Written by Natalie Vande Pol
# November 15, 2017
# Modified by Julian Liber
# May 2020-2022
# Command line: python MasterScript1.py
# Script will locate, import, and validate the database file specified
# Requires subscript_lineage2taxonomyTrain.py and subscript_fasta_addFullLineage.py
# be in the same directory
# Creates the following files in the input file location:
# - a combined taxonomy and fasta database file compatible with the UTAX and
# SINTAX classifiers
# - fasta and taxonomy database files compatible with the RDP classifer
# - training files for the RDP classifier
# Output files are named and located based on the input file
# -*- coding: utf-8 -*-
import sys, os, unicodedata, argparse, glob, time
def convert_utax_line(utax_taxa):
r_lets = "dkpcofgs"
new_taxa = []
for i in range(min(len(utax_taxa), len(r_lets)) - 1):
new_taxa.append(F"{r_lets[i]}:{utax_taxa[i]}")
if utax_taxa[-1].count("_") > 1:
new_taxa.append(F"{r_lets[-1]}:{utax_taxa[-1]}")
else:
new_taxa.append(F"{r_lets[min(len(utax_taxa), len(r_lets)) - 1]}:{utax_taxa[-1]}")
return ",".join(new_taxa)
parser = argparse.ArgumentParser()
parser.add_argument("-d", "--db", type=str, help="database file")
parser.add_argument("-t", "--tf", type=str, help="training files path")
parser.add_argument("-f", "--format", type=str, help="database formatting")
parser.add_argument("-p", "--path", type=str, help="path to subscript imports")
parser.add_argument("--dup", action='store_true')
args = parser.parse_args()
sys.path.append(args.path + "/")
print(F"Importing subscripts from {args.path}")
import subscript_lineage2taxonomyTrain, subscript_fasta_addFullLineage
# silva = "null"
filename = args.db
filename_base = args.tf + "/" + ".".join(os.path.basename(filename).split(".")[:-1])
print("\n____________________________________________________________________\nReformatting database\n")
start = time.process_time()
#RDP output files
fasta = open(filename_base+"__RDP.fasta","w")
taxon_fn = filename_base+"__RDP_taxonomy.txt"
taxon = open(taxon_fn,"w")
print(F"{args.format} format detected\n")
if args.format == "UNITE":
taxon.write("Seq_ID\tKingdom\tPhylum\tClass\tOrder\tFamily\tGenus\tSpecies\n")
num = 0
with open(filename) as database:
for line in database:
if line[0] == ">":
#correct umlauts or special letters
ascii_line = unicodedata.normalize('NFKD', line).encode('ASCII', 'ignore')
temp = ascii_line.decode()[1:].split("|")
#RDP files
name = str(temp[1])
temp2 = temp[-1].strip().split("__") # take last element from the "|" split
to_genus = [ item[:-2] for item in temp2[1:-1] ]
if "Incertae_sedis" in to_genus:
indices = [i for i,x in enumerate(to_genus) if x == "Incertae_sedis"]
for j in indices:
if "Incertae_sedis" not in to_genus[j-1]:
to_genus[j] = str(to_genus[j-1])+"_Incertae_sedis"
else:
to_genus[j] = str(to_genus[j-1])
if "unidentified" in to_genus:
indices = [i for i,x in enumerate(to_genus) if x == "unidentified"]
for j in indices:
to_genus[j] = "-"
if to_genus[0] != "-":
species = str(temp2[-1])
if "Incertae" in species:
species = "unidentified_sp"
elif to_genus[-1] not in species:
temp=species.split("_")
species = temp[0]+"_unidentified_"+temp[1]
if species.endswith("sp"):
species+= "_"+str(num)
num += 1
taxonomy = name+"\t"+"\t".join(to_genus)+"\t"+species+"\n"
fasta.write(">"+name+"\n")
taxon.write(taxonomy)
seq = next(database)
fasta.write(seq)
fasta.close()
taxon.close()
else:
max_rank = 1
with open(filename, "r") as database:
line = database.readline()
while line != "":
t_list = line.strip().split(";")
if len(t_list) > max_rank:
max_rank = len(t_list)
line = database.readline()
taxon.write("Seq_ID\t" + "\t".join([F"Rank_{x}" for x in range(1, max_rank+1)]) + "\n")
with open(filename, "r") as database:
line = database.readline()
while line != "":
line = line.replace(" Bacteria;", "?Bacteria;").replace(" Eukaryota;", "?Eukaryota;").replace(" Archaea;", "?Archaea;").replace(" ", "_")
ascii_line = unicodedata.normalize('NFKD', line).encode('ASCII', 'ignore')
# temp = ascii_line.decode().replace("*", "_").replace("'", "").replace(",", "").replace("Oral_Taxon", "oral_taxon")[1:].split("?")
temp = ascii_line.decode().translate(str.maketrans("*,<>", "_ ")).replace("Oral_Taxon", "oral_taxon").replace("'","")[1:].split("?")
name = str(temp[0]).split(".")[0]
t_list = temp[1].strip().split(";")
if "unidentified" in t_list:
indices = [i for i,x in enumerate(t_list) if x == "unidentified"]
non_unid = [i for i,x in enumerate(t_list) if x != "unidentified"]
for j in indices:
t_list[j] = "-"
if t_list[-1] == "-": # if lowest rank in unidentified, add the lowest identified rank to the lowest taxa
t_list[-1] = t_list[non_unid[-1]] + "_unidentified"
if len(t_list) < max_rank:
t_list = t_list[:-1] + ["-"]*(max_rank - len(t_list)) + [t_list[-1]] # fill in missing ranks
taxonomy = name+"\t"+"\t".join(t_list)+"\n"
fasta.write(">"+name+"\n")
taxon.write(taxonomy)
line = database.readline()
seq = ""
while line != "" and line[0] != ">":
seq += line.strip()
line = database.readline()
seq = seq.replace("U", "T")
fasta.write(seq + "\n")
fasta.close()
taxon.close()
print(F"Reference database FASTAs formatted in {time.process_time() - start} seconds...\n")
os.system(F"rm {filename_base}__RDP_taxonomy_trained.txt 2> /dev/null")
os.system(F"rm {filename_base}__RDP_taxonomy_headers.txt 2> /dev/null")
subscript_lineage2taxonomyTrain.lin2tax(filename_base, args.format, args.dup)
subscript_fasta_addFullLineage.addFullLineage(filename_base, args.format)
print("Database formatting complete\n____________________________________________________________________\n\n")