-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfilter_blast_qcov.py
59 lines (51 loc) · 2.13 KB
/
filter_blast_qcov.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# -*- coding: utf-8 -*-
"""
Created on Wed Dec 18 11:20:06 2019
@author: YudongCai
@Email: [email protected]
"""
import click
import numpy as np
@click.command()
@click.option('--blastout', help='blast输出结果, 最后一行#开头')
@click.option('--outprefix', help='过滤后的blast结果')
def main(blastout, outprefix):
"""
对于每个query,选取query coverage per subject(qcovs)最大的相应结果
blast输入格式要求outfmt 7 std,外加一个qcovs就可以,可以再增加其他的字段
比如为 -outfmt '7 std qcovs qcovhsp qcovus'
"""
records = []
qcovs = []
with open(blastout) as f1, open(f'{outprefix}.qcovs.tsv', 'w') as f2, open(f'{outprefix}.qstat.tsv', 'w') as f3:
for line in f1:
if line[0] != '#':
records.append(line)
qcov_index = header.index('% query coverage per subject')
qcov = int(line.strip().split()[qcov_index])
qcovs.append(qcov)
else:
f2.write(line)
if line[2:7] == 'Query':
print(line.strip('#'), end='')
query = line.strip().split()[-1]
if line[2:8] == 'Fields':
header = [x.strip() for x in line[10:].split(',')]
print(header)
if records:
qcovs = np.array(qcovs)
max_qcov = qcovs.max()
lens = []
idents = []
for record, ismax in zip(records, qcovs==max_qcov):
if ismax:
f2.write(record)
tline = record.split()
lens.append(int(tline[header.index('alignment length')]))
idents.append(float(tline[header.index('% identity')]))
ident = np.average(idents, weights=lens)
f3.write(f'{query}\t{ident}\t{max_qcov}\n')
records = []
qcovs = []
if __name__ == '__main__':
main()