-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathortho_end_to_fasta.py
executable file
·104 lines (90 loc) · 3.55 KB
/
ortho_end_to_fasta.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#!/usr/bin/env python2.7
# this script creates fasta files of protein or nucleotide sequences
# from the .end output file of FastOrtho
# Sarah Guermond
# 18 June 2014
import io
import sys
import re
import os
from collections import defaultdict
from Bio import SeqIO
if len(sys.argv) < 5:
print("\n\tThis script creates fasta files of protein or nucleotide sequences")
print("\tfrom the .end output file of FastOrtho.")
print("\n\tUsage: ./ortho_end_to_fasta.py <ortho.end> <in format> <list>")
print("\n\t<ortho.end>: FastOrtho .end file")
print("\t<in format>: file extension of FastOrtho input file (pep, fasta...)")
print("\t<num taxa>: minimum number of taxa desired in ortholog")
print("\t<list>: tab-separated list of reference nuc/pep files, FastOrtho input file, and species prefix")
print("\t\tEx: \tDuncanopsammia_longest.cds\tDaxi.transdecoder.pep\tDaxi\n")
quit()
orthologs = sys.argv[1]
extension = sys.argv[2]
min_taxa = sys.argv[3]
info_file_list = sys.argv[4]
### to do: automatically move new fasta files to new dir
# make new dir ortho_fasta to create fasta files in
# os.makedirs("ortho_fasta")
# read list of species info
#### reference transdecoder species
info_handle = io.open(info_file_list, "rb")
info_files = []
info = {}
for line in info_handle:
info_files_list = line.strip().split("\t")
info = {"ref_file": info_files_list[0], "fo_file": info_files_list[1], "species_prefix": info_files_list[2]}
info_files.append(info)
info_handle.close()
# dictionary: species.seqname -> ortholog
ortho_handle = io.open(orthologs, "rb")
species_seqname_to_ortho = {}
for line in ortho_handle:
# get number of taxa
line_list = line.strip().split("\t")
ortho_header = line_list[0]
ortho_header_split = ortho_header.split(" ")
gene_taxa = ortho_header_split[2]
num_taxa_result = re.subn("genes,", "", gene_taxa, 0)
num_taxa_split = num_taxa_result[0].strip().split(" ")
num_taxa = num_taxa_split[0]
# skip to next if too few taxa in ortholog
if num_taxa < min_taxa:
continue
# get ortholog name
ortho = ortho_header_split[0]
# get species filenames, seqs
seq_line = line_list[1].strip()
seq_species = seq_line.split(" ")
for seq_sp in seq_species:
result = re.subn("[\(\)]", " ", seq_sp, 0)
seq_sp_split = result[0].strip().split(" ")
seq_name = seq_sp_split[0]
species_file = seq_sp_split[1] + "." + extension
# search info_list for same FastOrtho file to get species
item_info = (item for item in info_files if item["fo_file"] == species_file).next()
# make key
species_seq = item_info['species_prefix'] + "." + seq_name
species_seqname_to_ortho[species_seq] = ortho
ortho_handle.close()
# for seq in reference file
for species in info_files:
infile = open(species['ref_file'], "rU")
for record in SeqIO.parse(infile, "fasta"):
# get seqname
seqid_raw = record.id
seqid_list = seqid_raw.strip().split('\s')
seqid = seqid_list[0]
seq = str(record.seq)
# recover species.seqname
species_seqname = species['species_prefix'] + "." + seqid
# find it in the dictionary, get associated ortholog
if species_seqname in species_seqname_to_ortho:
# change | in seqid to _
seqid_result = re.subn("\|", "_", seqid_pipe, 0)
seqid_split = seqid_result[0].strip().split(" ")
seqid = seqid_split[0]
# get ortholog
ortho = species_seqname_to_ortho.get(species_seqname)
with open(ortho + '.fasta', 'a') as outfile:
outfile.write('>' + species['species_prefix'] + '_' + seqid + '\n' + seq + '\n')