-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsam2G2R.py
executable file
·102 lines (86 loc) · 3.46 KB
/
sam2G2R.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# coding=utf-8
#############################################################################
# #
# Associate reads to genes from a mapping on reference transcriptome #
# Works only for ENSEMBL formated cDNA database #
# #
# #
# #
# #
# Version: 0.3 #
# Author: Quentin Bonenfant #
#############################################################################
import pysam
import sys
import os
from collections import defaultdict as dd
SEP = "\t" # separator
def sam2graph(sam):
""" Use BAM/SAM file to convert mapping into a ref / read dictionnary"""
# storing gene/reads in default dictionnary, using set as factory.
mapping_dict = dd(set)
# using pysam to go through sam/bam files
with pysam.AlignmentFile(sam) as samfile:
for read in samfile.fetch():
if(not read.flag & 4):
# Fetching read and reference names
targetName = read.query_name
refName = read.reference_name
mapping_dict[refName].add(targetName)
return(mapping_dict)
def parseFasta(fastaFile):
"""Go through an ENSEMBL transcriptome fasta file
and associate transcript accession to gene accession"""
fasta = dd(set) # associate a gene to a set of transcripts
with open(fastaFile, 'r') as f:
for line in f:
if ">" == line[0]:
data = line[1:].split()
transcript_acc = data[0]
gene_acc = data[3].split(":")[1]
fasta[gene_acc].add(transcript_acc)
# not storing sequences
return(fasta)
if(len(sys.argv) != 3):
print("USAGE: sam2G2R FASTAFILE(ref) SAMFILE(mapping)")
print("OUTFILE ARE GENERATED IN THE SAME PATH AS SAMFILE")
exit()
fastaFile = sys.argv[1]
samFile = sys.argv[2]
# parsing fasta
print("PARSING FASTA")
fasta = parseFasta(fastaFile)
# parsing sam
print("PARSING SAM/BAM")
mapping_dict = sam2graph(samFile)
# risky way to remove ".sam" extension
filePath = os.path.basename(samFile)[:-4]
# appending our own extension
filePath += "_gene2readlist.txt"
out = open(filePath, "w")
print("OUTPUT FILE:")
print(filePath)
print("PROCESSING")
for gene in fasta.keys():
gene_block = "G" + SEP + gene + "\n"
trans_block = ""
print_gene = False # should we print the gene ? be printed ?
for trans in fasta[gene]:
try:
# try to fetch reads mapped to this transcript
reads = mapping_dict[trans]
if(len(reads) < 1):
raise KeyError
except KeyError:
# this will often not be the case, so pass if nothing is associated
pass
else:
trans_block += "T" + SEP + trans + "\n"
print_gene = True # Yes, reads are associated
for read in reads:
trans_block += "R" + SEP + read + "\n"
if(print_gene):
out.write(gene_block)
out.write(trans_block)
out.close()