-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgetReadAssignement.py
executable file
·46 lines (37 loc) · 1.31 KB
/
getReadAssignement.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# coding=utf-8
# Search read if in G2R file and print a TSV output containing gene and transcript associated to this read
# input format plain text for the list of reads, one read id per line,
# and g2rl for the gene/ read map
# (easier / lighter to store / move around than a bam file and a GTF)
import igraph as ig
import sys
listFile = sys.argv[1]
g2rFile = sys.argv[2]
readList = []
def parsg2r(g2rFile):
readTag = {}
with open(g2rFile, 'r') as g2rf:
currentGene = ""
currentTrans = ""
for line in g2rf:
value = line.split("\t")[1].rstrip("\n")
if(line[0] == "G"):
currentGene = value
elif(line[0] == "T"):
currentTrans = value
elif(line[0] == "R"):
read = value
if(read in readList):
print(read, currentTrans, currentGene, sep = "\t")
readList.pop(readList.index(read))
readTag[read] = currentGene
return(readTag)
with open(listFile, "r") as f:
readSet = set() # used to avoid redundant read names.
for line in f:
readSet.add(line.rstrip("\n"))
readList = list(readSet)
parsg2r(g2rFile)
if(len(readList) != 0 ):
for elem in readList:
print(elem, "UNASSIGNED", "UNASSIGNED", sep = "\t")