-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrans2gene.py
executable file
·43 lines (37 loc) · 1.16 KB
/
trans2gene.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# coding=utf-8
# convert transcript tag to appropriate gene tag
import sys
import os
import re
# Matches a mouse Ensembl transcript ID, optionally followed by a ".<version>"
# suffix (e.g. "ENSMUST00000000001" or "ENSMUST00000000001.3").
transcript_rule = re.compile(r'ENSMUST\d+\.?\d*')


def load_transcript_to_gene(g2t_file):
    """Build a transcript -> gene dictionary from a gene-to-transcript file.

    Each non-empty line of *g2t_file* must hold exactly two tab-separated
    fields: a tag and a value.  A line tagged "G" sets the current gene;
    a line with any other tag maps its value (a transcript ID) to the
    current gene.  Values read before the first "G" line map to "".

    Raises ValueError if a non-empty line does not have exactly two fields.
    """
    t2g = {}
    current_gene = ""
    with open(g2t_file, 'r') as f:
        for line in f:
            line = line.rstrip("\r\n")
            if not line:
                # Tolerate blank lines (typically a trailing one) instead of
                # crashing on the two-field unpack below.
                continue
            tag, value = line.split("\t")
            if tag == "G":
                current_gene = value
            else:
                t2g[value] = current_gene
    return t2g


def convert_line(line, t2g):
    """Return *line* with every known transcript ID replaced by its gene.

    The version suffix is dropped before the lookup in *t2g*.  IDs missing
    from *t2g* are reported on stdout and left unchanged.  Anything that is
    not a transcript ID, including the line terminator, is kept as is.
    """
    def to_gene(match):
        transcript = match.group(0)
        gene = t2g.get(transcript.split('.')[0])
        if gene is None:
            print("TRANSCRIPT NOT IN TRANSCRIPTOM ?? ", transcript)
            return transcript
        return gene

    # Substituting match by match (rather than str.replace on the whole
    # line) keeps one ID from clobbering a longer ID that shares its prefix,
    # e.g. "...01.1" versus "...01.12".
    return transcript_rule.sub(to_gene, line)


def main(argv=None):
    """Convert transcript tags to gene tags in the file named on the CLI.

    argv[1] is the file to convert and argv[2] the gene-to-transcript file.
    The result is written next to the input as "gene_tag" + <input name>.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        sys.exit("usage: %s <file_to_convert> <gene_to_transcript_file>"
                 % os.path.basename(argv[0] if argv else "trans2gene.py"))

    to_convert_file = argv[1]
    g2t_file = argv[2]
    path = os.path.dirname(to_convert_file)
    file_base = os.path.basename(to_convert_file)
    converted_file = os.path.join(path, "gene_tag" + file_base)

    t2g = load_transcript_to_gene(g2t_file)

    # Both files are closed by the context managers, even on error.
    with open(to_convert_file, 'r') as src:
        with open(converted_file, 'w') as out:
            for line in src:
                # The line keeps its own terminator, so the output has the
                # same line structure as the input.
                out.write(convert_line(line, t2g))


if __name__ == "__main__":
    main()