-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGenome_Composition.py
128 lines (97 loc) · 3.6 KB
/
Genome_Composition.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/usr/bin/env python
import Header
from Header import *
##---------------------------------------------------------------------------------
''' this function is useful to parse various options for input data processing '''
##---------------------------------------------------------------------------------
def parse_options():
parser = OptionParser()
parser.add_option("-D", "--INPDIR", \
type="string", \
action="store", \
dest="INP_DIR", \
default="", \
help="path of the input directory containing all species genome in subdirectory")
parser.add_option("-O", "--OUTDIR", \
type="string", \
action="store", \
dest="OUT_DIR", \
default="", \
help="name of the output directory containing all output files in subdirectory (INP_DIR)")
parser.add_option("-I", "--INPSEQUENCEFILE", \
type="string", \
action="store", \
dest="INP_SEQUENCE_FILE", \
default="", \
help="name of the file containing the sequence of the species (all should be same name)")
parser.add_option("-H", "--REVSTRAND", \
action="store_true", \
dest="REV_STRAND", \
default=False, \
help="If 0, it computes the GFP for forward strands \
Else, it will computes the GFP for reverse strands")
opts, args = parser.parse_args()
return opts, args
#---------------------------------------------------------------------------------------
def main():
opts, args = parse_options()
INP_DIR = opts.INP_DIR
OUT_DIR = opts.OUT_DIR
INP_SEQUENCE_FILE = opts.INP_SEQUENCE_FILE
REV_STRAND = opts.REV_STRAND
if REV_STRAND == 1:
REV_STRAND = True
else:
REV_STRAND = False
dataset = INP_DIR.split('/')[-1]
if (os.path.isdir(OUT_DIR) == False):
mkdr_cmd = 'mkdir -p ' + OUT_DIR
os.system(mkdr_cmd)
OUT_FILE = OUT_DIR + '/' + dataset + '.csv'
fp_write = open(OUT_FILE,'w')
fp_write.write('Species Name,Accession Number,Sequence Length,A%,T%,G%,C%,Unrecognized%,AT%,GC%,AT skew, GC skew\n')
rootDir = INP_DIR
for dirName, subdirList, fileList in os.walk(rootDir):
for fname in fileList:
if fname == INP_SEQUENCE_FILE:
input_file = dirName + '/' + INP_SEQUENCE_FILE
#print'\nFound file: ', input_file
species_name = dirName.split('/')[-1]
gene_sequences = SeqIO.parse(open(input_file),'fasta')
A_count = 0
T_count = 0
G_count = 0
C_count = 0
others = 0
for gene in gene_sequences:
seq_id = gene.id
sequence=gene.seq
if REV_STRAND:
sequence = sequence.reverse_complement()
seq_len = len(sequence)
for i in range(seq_len):
if sequence[i] == 'A':
A_count += 1
elif sequence[i] == 'T':
T_count += 1
elif sequence[i] == 'G':
G_count += 1
elif sequence[i] == 'C':
C_count += 1
else:
others += 1
A_percent = A_count*100.0/seq_len
T_percent = T_count*100.0/seq_len
G_percent = G_count*100.0/seq_len
C_percent = C_count*100.0/seq_len
others_percent = others*100.0/seq_len
AT_percent = (A_count+T_count)*100.0/seq_len
GC_percent = (G_count+C_count)*100.0/seq_len
AT_skew = (A_count-T_count)*1.0/(A_count+T_count)
GC_skew = (G_count-C_count)*1.0/(G_count+C_count)
fp_write.write(species_name+','+seq_id+','+str(seq_len)+','+str(A_percent)+','+str(T_percent)+','+str(G_percent)+','+str(C_percent)+','+str(others_percent)+','+str(AT_percent)+','+str(GC_percent)+','+str(AT_skew)+','+str(GC_skew)+'\n')
print 'The Genome Composition is written at: ',OUT_FILE
fp_write.close()
#---------------------------------------------------------------------------------------
if __name__=='__main__':
main()