Merge pull request #110 from yhoogstrate/exonic_intronic_integrate

Exonic intronic integrate
yhoogstrate · Feb 2, 2018 · f78b484 · f78b484
2 parents 0eb122b + 0097330
commit f78b484
Show file tree

Hide file tree

Showing 20 changed files with 106 additions and 74 deletions.
diff --git a/Changelog b/Changelog
@@ -1,3 +1,10 @@
+2018-02-01	Youri Hoogstrate	v0.16.0
+	 * Adds the exons before the break and exons after the break to the
+	   integrate results. Remark that the offset to match an exon is
+	   [-2,+2] because STAR may introduce slight offset errors.
+	 * Fixes bug in integrate where rather similar fusions in opposite
+	   strands were still bundled together.
+
 2018-02-01	Youri Hoogstrate	v0.15.2
 	 * Makes mismatches per base classification more stringent
 	   unless `--ffpe` is used. Using `--ffpe` the old behaviour

diff --git a/bin/dr-disco b/bin/dr-disco
@@ -140,8 +140,8 @@ def CLI_classify(table_input_file, table_output_file, only_valid, blacklist_regi
 @CLI.command(name='integrate', short_help='Maps junctions back together that are likely to correspond to the same fusion event.')
 @click.argument('table_input_file', type=click.Path(exists=True))
 @click.argument('table_output_file')
-@click.option('--gtf', help="Use gene annotation (GTF file)")
-@click.option('--fasta', help="Use FASTA sequence file to estimate edit distances to splice junction motifs")
+@click.option('--gtf', help="Use gene annotation for estimating fusion genes and improve classification of exonic (GTF file)")
+@click.option('--fasta', help="Use reference sequences to estimate edit distances to splice junction motifs (FASTA file)")
 def CLI_integrate(table_input_file, table_output_file, gtf, fasta):
     cl = DetectOutput(table_input_file)
 

diff --git a/drdisco/ChimericAlignment.py b/drdisco/ChimericAlignment.py
@@ -530,6 +530,8 @@ def randstr(n):
         if len(alignments) > 0:
             self.reconstruct_alignments(alignments, sam_file_discordant, fh)
         else:
+            os.remove(basename + ".name-sorted.bam")
+            os.remove(basename + ".name-sorted.fixed.sam")
             err = "No reads were found, fixing empty sam/bam file: " + self.input_alignment_file
             log.error(err)
             raise Exception(err)

diff --git a/drdisco/DetectFrameShifts.py b/drdisco/DetectFrameShifts.py
@@ -90,7 +90,7 @@ def load_gtf_per_transcript():
                             transcript_idx[transcript_id][exon_number][gtf_type] = feature
 
                     except KeyError:
-                        log.warn("Warning: GTF file misses certain attributes (gene_name, transcript_id or transcript_version) and is therefore skipping the frameshift detection.")
+                        log.warn("Warning: GTF file misses certain attributes (gene_name, transcript_id or transcript_version) and is therefore skipping the frameshift detection. Ensembl GTF files are known to be compatible.")
                         # there is no GFF_Reader.close() so break it dirty:
                         break
 
@@ -229,6 +229,9 @@ def evaluate(self, _from, _to, offset):
 
         results = {0: [], 1: [], 2: [], 'fgd': []}
 
+        all_from = sorted(list(set([_ for _ in from_l_fgd] + [_[0] for _ in from_l])))
+        all_to = sorted(list(set([_ for _ in to_l_fgd] + [_[0] for _ in to_l])))
+
         for from_l_i in from_l_fgd:
             for to_l_i in to_l_fgd:
                 results['fgd'].append((from_l_i, to_l_i))
@@ -238,4 +241,4 @@ def evaluate(self, _from, _to, offset):
                 frame_shift = ((from_l_i[1] + to_l_i[1]) % 3)
                 results[frame_shift].append((from_l_i, to_l_i))
 
-        return results
+        return all_from, all_to, results
diff --git a/drdisco/DetectOutput.py b/drdisco/DetectOutput.py
@@ -125,6 +125,9 @@ def parse(self):
 
         self.edit_dist_to_splice_motif = ""
 
+        self.exons_from = []
+        self.exons_to = []
+
         self.structure = self.line[46]
 
         inv = {'-': '+', '+': '-'}
@@ -464,12 +467,12 @@ def insert_in_index(index, entries, score, i):
             if score not in index:
                 index[score] = {}
 
-            key = entries[0].chrA + ':' + str(entries[0].posA) + '(' + entries[0].strandA + ')-' + entries[0].chrB + ':' + str(entries[0].posB) + '(' + entries[0].strandB + ')_' + str(i)
+            key = entries[0].chrA + ':' + str(entries[0].posA) + '(' + entries[0].strandA + ')-' + entries[0].chrB + ':' + str(entries[0].posB) + '(' + entries[0].strandB + ')|' + str(i)
             index[score][key] = entries
 
         with open(output_table, 'w') as fh_out:
             header = self.header.split("\t")
-            header = "\t".join(header[:-5] + ['full-gene-dysregulation', 'frameshift=0', 'frameshift=+1', 'frameshift=+2', 'splice-motif-edit-distance'] + header[-5:])
+            header = "\t".join(header[:-5] + ['full-gene-dysregulation', 'frameshift=0', 'frameshift=+1', 'frameshift=+2', 'splice-motif-edit-distance', "exons from (5')", "exons to (3')"] + header[-5:])
 
             fh_out.write("shared-id\tfusion\t" + header)
 
@@ -492,9 +495,9 @@ def insert_in_index(index, entries, score, i):
                     done_breaks = set([])
 
                     if e.donorA > e.donorB:
-                        frame_shifts = dfs.evaluate([e.chrA, e.posA, e.RNAstrandA], [e.chrB, e.posB, e.RNAstrandB], 2)
+                        exons_from, exons_to, frame_shifts = dfs.evaluate([e.chrA, e.posA, e.RNAstrandA], [e.chrB, e.posB, e.RNAstrandB], 2)
                     else:
-                        frame_shifts = dfs.evaluate([e.chrB, e.posB, e.RNAstrandB], [e.chrA, e.posA, e.RNAstrandA], 2)
+                        exons_from, exons_to, frame_shifts = dfs.evaluate([e.chrB, e.posB, e.RNAstrandB], [e.chrA, e.posA, e.RNAstrandA], 2)
 
                     done_breaks.add(e.chrA + ':' + str(e.posA) + '/' + str(e.posA + 1) + '(' + e.strandA + ')->' + e.chrB + ':' + str(e.posB) + '/' + str(e.posB + 1) + '(' + e.strandB + ')')
 
@@ -513,9 +516,13 @@ def insert_in_index(index, entries, score, i):
 
                             if params[0] not in done_breaks and n_split_reads > 0:
                                 if e.donorA > e.donorB:  # nice, use same thing to swap if necessary
-                                    frame_shifts = dfs.evaluate([e.chrA, posA, e.RNAstrandA], [e.chrB, posB, e.RNAstrandB], 2)
+                                    exons_from_, exons_to_, frame_shifts = dfs.evaluate([e.chrA, posA, e.RNAstrandA], [e.chrB, posB, e.RNAstrandB], 2)
                                 else:
-                                    frame_shifts = dfs.evaluate([e.chrB, posB, e.RNAstrandB], [e.chrA, posA, e.RNAstrandA], 2)
+                                    exons_from_, exons_to_, frame_shifts = dfs.evaluate([e.chrB, posB, e.RNAstrandB], [e.chrA, posA, e.RNAstrandA], 2)
+
+                                exons_from += exons_from_
+                                exons_to += exons_to_
+                                del(exons_from_, exons_to_)
 
                                 fgd += [x[0] + '->' + x[1] for x in frame_shifts['fgd']]
                                 frameshifts_0 += [x[0][0] + '->' + x[1][0] for x in frame_shifts[0]]
@@ -524,10 +531,15 @@ def insert_in_index(index, entries, score, i):
 
                             done_breaks.add(params[0])
 
+                    e.exons_from = sorted(list(set(exons_from)))
+                    e.exons_to = sorted(list(set(exons_to)))
+                    del(exons_from, exons_to)
+
                     e.fgd = ','.join(sorted(list(set(fgd))))
                     e.frameshift_0 = ','.join(sorted(list(set(frameshifts_0))))
                     e.frameshift_1 = ','.join(sorted(list(set(frameshifts_1))))
                     e.frameshift_2 = ','.join(sorted(list(set(frameshifts_2))))
+                    del(fgd, frameshifts_0, frameshifts_1, frameshifts_2)
 
                 if ffs:
                     e.is_on_splice_junction_motif(ffs)
@@ -581,7 +593,7 @@ def insert(pos, e):
 
                 top_result = (None, 9999999999999)
                 for r in sorted(results.keys()):
-                    if results[r] >= 2:
+                    if results[r] >= 2 and r.strandA == e.strandA and r.strandB == e.strandB:
                         d1 = (r.posA - e.posA)
                         d2 = (r.posB - e.posB)
                         sq_d = math.sqrt(pow(d1, 2) + pow(d2, 2))
@@ -596,7 +608,6 @@ def insert(pos, e):
                     insert_in_index(idx2, [e, top_result[0]], e.score + top_result[0].score, q)
                 else:
                     insert_in_index(idx2, [e], e.score, q)
-
                 q += 1
 
             for e in remainder:
@@ -613,7 +624,7 @@ def insert(pos, e):
                     for entry in idx2[score][key]:
                         if entry not in exported:
                             acceptors_donors = entry.get_donors_acceptors(gene_annotation)
-                            line = entry.line[:-5] + [entry.fgd, entry.frameshift_0, entry.frameshift_1, entry.frameshift_2, entry.edit_dist_to_splice_motif] + entry.line[-5:]
+                            line = entry.line[:-5] + [entry.fgd, entry.frameshift_0, entry.frameshift_1, entry.frameshift_2, entry.edit_dist_to_splice_motif, ",".join(entry.exons_from), ",".join(entry.exons_to)] + entry.line[-5:]
 
                             fh_out.write(str(i) + "\t" + acceptors_donors + "\t" + "\t".join(line) + "\n")
                             exported.add(entry)

diff --git a/drdisco/__init__.py b/drdisco/__init__.py
@@ -31,7 +31,7 @@
 import logging
 import sys
 
-__version_info__ = ('0', '15', '2')
+__version_info__ = ('0', '16', '0')
 __version__ = '.'.join(__version_info__) if (len(__version_info__) == 3) else '.'.join(__version_info__[0:3]) + "-" + __version_info__[3]
 __author__ = 'Youri Hoogstrate'
 __homepage__ = 'https://github.com/yhoogstrate/dr-disco'

diff --git a/tests/chim_overhang/test_01_integrate.out.txt b/tests/chim_overhang/test_01_integrate.out.txt
@@ -1,2 +1,2 @@
-shared-id	fusion	chr-A	pos-A	direction-A	pos-A-acceptor	pos-A-donor	chr-B	pos-B	direction-B	pos-B-acceptor	pos-B-donor	genomic-distance	filter-status	circRNA	intronic/exonic	score	soft+hardclips	n-split-reads	n-discordant-reads	alignment-score	mismatches	n-edges	n-nodes-A	n-nodes-B	n-splice-junc-A	n-splice-junc-B	entropy-bp-edge	entropy-all-edges	bp-pos-stddev	entropy-disco-bps	lr-A-slope	lr-A-intercept	lr-A-rvalue	lr-A-pvalue	lr-A-stderr	lr-B-slope	lr-B-intercept	lr-B-rvalue	lr-B-pvalue	lr-B-stderr	disco/split	clips/score	nodes/edge	full-gene-dysregulation	frameshift=0	frameshift=+1	frameshift=+2	splice-motif-edit-distance	median-AS-A	median-AS-B	max-AS-A	max-AS-B	data-structure
-1	chr2:17281790->chr11:5566704	chr2	17281790	+	0	22	chr11	5566704	-	22	0	inf	entropy=0.7372<0.7382,chim_overhang=21<25	linear	intronic	33	22	11	0	1530	10	1	1	1	0	0	0.7372	0.7372	0.0000	0.0000	0.3818	18.0000	0.8367	0.0013	0.0833	2.0455	38.7727	0.9652	0.0000	0.1847	0.0000	0.3333	2.0000						21	49	21	57	chr2:17281790/17281791(+)->chr11:5566704/5566705(-):(spanning_paired_1:11,spanning_paired_2:11)
+shared-id	fusion	chr-A	pos-A	direction-A	pos-A-acceptor	pos-A-donor	chr-B	pos-B	direction-B	pos-B-acceptor	pos-B-donor	genomic-distance	filter-status	circRNA	intronic/exonic	score	soft+hardclips	n-split-reads	n-discordant-reads	alignment-score	mismatches	n-edges	n-nodes-A	n-nodes-B	n-splice-junc-A	n-splice-junc-B	entropy-bp-edge	entropy-all-edges	bp-pos-stddev	entropy-disco-bps	lr-A-slope	lr-A-intercept	lr-A-rvalue	lr-A-pvalue	lr-A-stderr	lr-B-slope	lr-B-intercept	lr-B-rvalue	lr-B-pvalue	lr-B-stderr	disco/split	clips/score	nodes/edge	full-gene-dysregulation	frameshift=0	frameshift=+1	frameshift=+2	splice-motif-edit-distance	exons from (5')	exons to (3')	median-AS-A	median-AS-B	max-AS-A	max-AS-B	data-structure
+1	chr2:17281790->chr11:5566704	chr2	17281790	+	0	22	chr11	5566704	-	22	0	inf	entropy=0.7372<0.7382,chim_overhang=21<25	linear	intronic	33	22	11	0	1530	10	1	1	1	0	0	0.7372	0.7372	0.0000	0.0000	0.3818	18.0000	0.8367	0.0013	0.0833	2.0455	38.7727	0.9652	0.0000	0.1847	0.0000	0.3333	2.0000								21	49	21	57	chr2:17281790/17281791(+)->chr11:5566704/5566705(-):(spanning_paired_1:11,spanning_paired_2:11)
diff --git a/tests/chim_overhang/test_02_integrate.out.txt b/tests/chim_overhang/test_02_integrate.out.txt
@@ -1,2 +1,2 @@
-shared-id	fusion	chr-A	pos-A	direction-A	pos-A-acceptor	pos-A-donor	chr-B	pos-B	direction-B	pos-B-acceptor	pos-B-donor	genomic-distance	filter-status	circRNA	intronic/exonic	score	soft+hardclips	n-split-reads	n-discordant-reads	alignment-score	mismatches	n-edges	n-nodes-A	n-nodes-B	n-splice-junc-A	n-splice-junc-B	entropy-bp-edge	entropy-all-edges	bp-pos-stddev	entropy-disco-bps	lr-A-slope	lr-A-intercept	lr-A-rvalue	lr-A-pvalue	lr-A-stderr	lr-B-slope	lr-B-intercept	lr-B-rvalue	lr-B-pvalue	lr-B-stderr	disco/split	clips/score	nodes/edge	full-gene-dysregulation	frameshift=0	frameshift=+1	frameshift=+2	splice-motif-edit-distance	median-AS-A	median-AS-B	max-AS-A	max-AS-B	data-structure
-1	chr5:105166412->chr7:12213276	chr5	105166412	+	0	22	chr7	12213276	+	22	0	inf	entropy=0.7372<0.7382,chim_overhang=16<25	linear	intronic	33	22	11	0	1528	14	1	1	1	0	0	0.7372	0.7372	0.0000	0.0000	0.1273	15.0000	0.8367	0.0013	0.0278	0.6091	53.9545	0.9419	0.0000	0.0724	0.0000	0.3333	2.0000						16	57	16	59	chr5:105166412/105166413(+)->chr7:12213276/12213277(+):(spanning_paired_1:11,spanning_paired_2:11)
+shared-id	fusion	chr-A	pos-A	direction-A	pos-A-acceptor	pos-A-donor	chr-B	pos-B	direction-B	pos-B-acceptor	pos-B-donor	genomic-distance	filter-status	circRNA	intronic/exonic	score	soft+hardclips	n-split-reads	n-discordant-reads	alignment-score	mismatches	n-edges	n-nodes-A	n-nodes-B	n-splice-junc-A	n-splice-junc-B	entropy-bp-edge	entropy-all-edges	bp-pos-stddev	entropy-disco-bps	lr-A-slope	lr-A-intercept	lr-A-rvalue	lr-A-pvalue	lr-A-stderr	lr-B-slope	lr-B-intercept	lr-B-rvalue	lr-B-pvalue	lr-B-stderr	disco/split	clips/score	nodes/edge	full-gene-dysregulation	frameshift=0	frameshift=+1	frameshift=+2	splice-motif-edit-distance	exons from (5')	exons to (3')	median-AS-A	median-AS-B	max-AS-A	max-AS-B	data-structure
+1	chr5:105166412->chr7:12213276	chr5	105166412	+	0	22	chr7	12213276	+	22	0	inf	entropy=0.7372<0.7382,chim_overhang=16<25	linear	intronic	33	22	11	0	1528	14	1	1	1	0	0	0.7372	0.7372	0.0000	0.0000	0.1273	15.0000	0.8367	0.0013	0.0278	0.6091	53.9545	0.9419	0.0000	0.0724	0.0000	0.3333	2.0000								16	57	16	59	chr5:105166412/105166413(+)->chr7:12213276/12213277(+):(spanning_paired_1:11,spanning_paired_2:11)
diff --git a/tests/chim_overhang/test_03_integrate.out.txt b/tests/chim_overhang/test_03_integrate.out.txt
@@ -1,2 +1,2 @@
-shared-id	fusion	chr-A	pos-A	direction-A	pos-A-acceptor	pos-A-donor	chr-B	pos-B	direction-B	pos-B-acceptor	pos-B-donor	genomic-distance	filter-status	circRNA	intronic/exonic	score	soft+hardclips	n-split-reads	n-discordant-reads	alignment-score	mismatches	n-edges	n-nodes-A	n-nodes-B	n-splice-junc-A	n-splice-junc-B	entropy-bp-edge	entropy-all-edges	bp-pos-stddev	entropy-disco-bps	lr-A-slope	lr-A-intercept	lr-A-rvalue	lr-A-pvalue	lr-A-stderr	lr-B-slope	lr-B-intercept	lr-B-rvalue	lr-B-pvalue	lr-B-stderr	disco/split	clips/score	nodes/edge	full-gene-dysregulation	frameshift=0	frameshift=+1	frameshift=+2	splice-motif-edit-distance	median-AS-A	median-AS-B	max-AS-A	max-AS-B	data-structure
-1	chr8:86472668->chr17:40685761	chr8	86472668	-	6	8	chr17	40685761	+	8	6	inf	chim_overhang=18<25	linear	intronic	21	8	7	0	998	6	1	1	1	0	0	0.8277	0.8277	0.6547	0.0000	0.3214	53.7500	0.9186	0.0035	0.0619	0.2143	16.9286	0.8660	0.0117	0.0553	0.0000	0.1905	2.0000						55	18	56	18	chr8:86472668/86472669(-)->chr17:40685761/40685762(+):(spanning_paired_1:3,spanning_paired_1_t:4,spanning_paired_2:3,spanning_paired_2_t:4)
+shared-id	fusion	chr-A	pos-A	direction-A	pos-A-acceptor	pos-A-donor	chr-B	pos-B	direction-B	pos-B-acceptor	pos-B-donor	genomic-distance	filter-status	circRNA	intronic/exonic	score	soft+hardclips	n-split-reads	n-discordant-reads	alignment-score	mismatches	n-edges	n-nodes-A	n-nodes-B	n-splice-junc-A	n-splice-junc-B	entropy-bp-edge	entropy-all-edges	bp-pos-stddev	entropy-disco-bps	lr-A-slope	lr-A-intercept	lr-A-rvalue	lr-A-pvalue	lr-A-stderr	lr-B-slope	lr-B-intercept	lr-B-rvalue	lr-B-pvalue	lr-B-stderr	disco/split	clips/score	nodes/edge	full-gene-dysregulation	frameshift=0	frameshift=+1	frameshift=+2	splice-motif-edit-distance	exons from (5')	exons to (3')	median-AS-A	median-AS-B	max-AS-A	max-AS-B	data-structure
+1	chr8:86472668->chr17:40685761	chr8	86472668	-	6	8	chr17	40685761	+	8	6	inf	chim_overhang=18<25	linear	intronic	21	8	7	0	998	6	1	1	1	0	0	0.8277	0.8277	0.6547	0.0000	0.3214	53.7500	0.9186	0.0035	0.0619	0.2143	16.9286	0.8660	0.0117	0.0553	0.0000	0.1905	2.0000								55	18	56	18	chr8:86472668/86472669(-)->chr17:40685761/40685762(+):(spanning_paired_1:3,spanning_paired_1_t:4,spanning_paired_2:3,spanning_paired_2_t:4)