-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfetch_seq.py
60 lines (55 loc) · 1.48 KB
/
fetch_seq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/usr/bin/python
"""
"""
# IMPORT
from Bio import SeqIO
import sys
# FUNCTIONS
def parse_blast_out(gene):
blast_out = gene + ".txt"
f = open(gene+"_DC3000.fas","r")
seqs = list(SeqIO.parse(f,"fasta"))
f.close()
length = len(seqs[0].seq)
f = open(blast_out,"r")
lines = [i.strip().split("\t") for i in f.readlines()]
f.close()
alignment = {}
for line in lines:
subject = line[1]
start = int(line[8])
end = int(line[9])
if subject not in alignment and int(subject) <= 410 and int(line[3])==length:
alignment[subject] = [start,end]
else:
continue
return alignment
def get_seq(alignment,db):
f = open(db,"r")
seqs = list(SeqIO.parse(f,"fasta"))
f.close()
seqs_dict = {seq.id:seq.seq for seq in seqs}
genome_ids = alignment.keys()
genes = []
for id in genome_ids:
start = alignment[id][0]
end = alignment[id][1]
if end > start:
seq = str(seqs_dict[id][start-1:end])
else:
seq = str(seqs_dict[id][end-1:start].reverse_complement())
genes.append([id,seq])
return genes
def fetch_seq(gene,db):
alignment = parse_blast_out(gene=gene)
genes = get_seq(alignment=alignment,db=db)
f = open(gene+"_align.fasta","w")
for i in genes:
f.write(">"+i[0]+"\n")
f.write(i[1]+"\n")
f.close()
# MAIN
if __name__ == '__main__':
gene = sys.argv[1]
db = sys.argv[2]
fetch_seq(gene,db)