-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathgff2fa.py
executable file
·111 lines (95 loc) · 4.44 KB
/
gff2fa.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/usr/bin/env python
from optparse import OptionParser
import gzip, os, pdb, sys
import dna, gff
################################################################################
# gff2fa.py
#
# Given a gff file and fasta file or chromosome fasta file directory, produce
# a fasta file of the gff entries.
################################################################################
################################################################################
# main
################################################################################
def main():
usage = 'usage: %prog [options] <gff file>'
parser = OptionParser(usage)
parser.add_option('-c', dest='chr_dir', default='', help='Directory of chromosome files named according to the first column of the gff file')
parser.add_option('-f', dest='fasta_file', default='%s/research/common/data/genomes/hg19/sequence/hg19.fa' % os.environ['HOME'], help='Fasta file with entries named according to the first column of the gff file [Default: %default]')
parser.add_option('-g', dest='gene_too', default=False, action='store_true', help='Print transcript id and gene id [Default: %default]')
#parser.add_option('--gtf', dest='gtf', action='store_true', default=False, help='Input file is gtf and linked entries should be combined into a single sequence [Default: %default]')
parser.add_option('--head', dest='header_key', default=None, help='GFF key to be used to merge GFF entries and label the fasta headers [Default: %default]')
parser.add_option('-s', dest='split_lines', default=None, action='store_true', help='Split sequence across multiple lines [Default: %default]')
parser.add_option('-x', dest='exon', action='store_true', default=False, help='Only include exon rows [Default: %default]')
(options,args) = parser.parse_args()
if len(args) != 1:
parser.error('Must provide gff file')
else:
gff_file = args[0]
if options.fasta_file:
fasta_files = [options.fasta_file]
elif options.chr_dir:
gff_chrs = set([line.split()[0] for line in open(gff_file)])
fasta_files = []
for chrom in gff_chrs:
fasta_files += glob.glob(options.chr_dir+'/%s*' % chrom)
else:
parser.error('Must provide fasta source')
for fasta_file in fasta_files:
if fasta_file[-3:] == '.gz':
fasta_open = gzip.open(fasta_file)
else:
fasta_open = open(fasta_file)
header = ''
line = fasta_open.readline()
while line:
if line[0] == '>':
if header:
header_gff(header, seq, gff_file, options)
header = line[1:].split()[0]
seq = ''
else:
seq += line.rstrip()
line = fasta_open.readline()
header_gff(header, seq, gff_file, options)
################################################################################
# header_gff
#
# Print sequence features for the given header and seq from the given gff file
################################################################################
def header_gff(header, seq, gff_file, options):
header_seqs = {}
for line in open(gff_file):
a = line.split('\t')
a[-1] = a[-1].rstrip()
if (not options.exon or a[2] == 'exon') and a[0] == header:
try:
kv = gff.gtf_kv(a[8])
except:
kv = {}
head_id = kv.get(options.header_key,a[0]+':'+a[3]+'-'+a[4])
#head_id = kv.get(options.header_key,a[8])
if options.gene_too:
head_id += ' gene=%s' % kv.get('gene_id','')
feat_start = int(a[3])
feat_end = int(a[4])
feat_seq = seq[feat_start-1:feat_end]
if a[6] == '+':
header_seqs[head_id] = header_seqs.get(head_id,'') + feat_seq
else:
header_seqs[head_id] = dna.rc(feat_seq) + header_seqs.get(head_id,'')
for header in header_seqs:
print '>%s' % header
if options.split_lines:
i = 0
while i < len(header_seqs[header]):
print header_seqs[header][i:i+60]
i += 60
else:
print header_seqs[header]
################################################################################
# __main__
################################################################################
if __name__ == '__main__':
main()
#pdb.runcall(main)