Skip to content

Commit

Permalink
Merge pull request #110 from yhoogstrate/exonic_intronic_integrate
Browse files Browse the repository at this point in the history
Exonic intronic integrate
  • Loading branch information
yhoogstrate authored Feb 2, 2018
2 parents 0eb122b + 0097330 commit f78b484
Show file tree
Hide file tree
Showing 20 changed files with 106 additions and 74 deletions.
7 changes: 7 additions & 0 deletions Changelog
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
2018-02-01 Youri Hoogstrate v0.16.0
* Adds the exons before the break and exons after the break to the
integrate results. Remark that the offset to match an exon is
[-2,+2] because STAR may introduce slight offset errors.
* Fixes bug in integrate where rather similar fusions in opposite
strands were still bundled together.

2018-02-01 Youri Hoogstrate v0.15.2
* Makes mismatches per base classification more stringent
unless `--ffpe` is used. Using `--ffpe` the old behaviour
Expand Down
4 changes: 2 additions & 2 deletions bin/dr-disco
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,8 @@ def CLI_classify(table_input_file, table_output_file, only_valid, blacklist_regi
@CLI.command(name='integrate', short_help='Maps junctions back together that are likely to correspond to the same fusion event.')
@click.argument('table_input_file', type=click.Path(exists=True))
@click.argument('table_output_file')
@click.option('--gtf', help="Use gene annotation (GTF file)")
@click.option('--fasta', help="Use FASTA sequence file to estimate edit distances to splice junction motifs")
@click.option('--gtf', help="Use gene annotation for estimating fusion genes and improve classification of exonic (GTF file)")
@click.option('--fasta', help="Use reference sequences to estimate edit distances to splice junction motifs (FASTA file)")
def CLI_integrate(table_input_file, table_output_file, gtf, fasta):
cl = DetectOutput(table_input_file)

Expand Down
2 changes: 2 additions & 0 deletions drdisco/ChimericAlignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -530,6 +530,8 @@ def randstr(n):
if len(alignments) > 0:
self.reconstruct_alignments(alignments, sam_file_discordant, fh)
else:
os.remove(basename + ".name-sorted.bam")
os.remove(basename + ".name-sorted.fixed.sam")
err = "No reads were found, fixing empty sam/bam file: " + self.input_alignment_file
log.error(err)
raise Exception(err)
Expand Down
7 changes: 5 additions & 2 deletions drdisco/DetectFrameShifts.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def load_gtf_per_transcript():
transcript_idx[transcript_id][exon_number][gtf_type] = feature

except KeyError:
log.warn("Warning: GTF file misses certain attributes (gene_name, transcript_id or transcript_version) and is therefore skipping the frameshift detection.")
log.warn("Warning: GTF file misses certain attributes (gene_name, transcript_id or transcript_version) and is therefore skipping the frameshift detection. Ensembl GTF files are known to be compatible.")
# there is no GFF_Reader.close() so break it dirty:
break

Expand Down Expand Up @@ -229,6 +229,9 @@ def evaluate(self, _from, _to, offset):

results = {0: [], 1: [], 2: [], 'fgd': []}

all_from = sorted(list(set([_ for _ in from_l_fgd] + [_[0] for _ in from_l])))
all_to = sorted(list(set([_ for _ in to_l_fgd] + [_[0] for _ in to_l])))

for from_l_i in from_l_fgd:
for to_l_i in to_l_fgd:
results['fgd'].append((from_l_i, to_l_i))
Expand All @@ -238,4 +241,4 @@ def evaluate(self, _from, _to, offset):
frame_shift = ((from_l_i[1] + to_l_i[1]) % 3)
results[frame_shift].append((from_l_i, to_l_i))

return results
return all_from, all_to, results
29 changes: 20 additions & 9 deletions drdisco/DetectOutput.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,9 @@ def parse(self):

self.edit_dist_to_splice_motif = ""

self.exons_from = []
self.exons_to = []

self.structure = self.line[46]

inv = {'-': '+', '+': '-'}
Expand Down Expand Up @@ -464,12 +467,12 @@ def insert_in_index(index, entries, score, i):
if score not in index:
index[score] = {}

key = entries[0].chrA + ':' + str(entries[0].posA) + '(' + entries[0].strandA + ')-' + entries[0].chrB + ':' + str(entries[0].posB) + '(' + entries[0].strandB + ')_' + str(i)
key = entries[0].chrA + ':' + str(entries[0].posA) + '(' + entries[0].strandA + ')-' + entries[0].chrB + ':' + str(entries[0].posB) + '(' + entries[0].strandB + ')|' + str(i)
index[score][key] = entries

with open(output_table, 'w') as fh_out:
header = self.header.split("\t")
header = "\t".join(header[:-5] + ['full-gene-dysregulation', 'frameshift=0', 'frameshift=+1', 'frameshift=+2', 'splice-motif-edit-distance'] + header[-5:])
header = "\t".join(header[:-5] + ['full-gene-dysregulation', 'frameshift=0', 'frameshift=+1', 'frameshift=+2', 'splice-motif-edit-distance', "exons from (5')", "exons to (3')"] + header[-5:])

fh_out.write("shared-id\tfusion\t" + header)

Expand All @@ -492,9 +495,9 @@ def insert_in_index(index, entries, score, i):
done_breaks = set([])

if e.donorA > e.donorB:
frame_shifts = dfs.evaluate([e.chrA, e.posA, e.RNAstrandA], [e.chrB, e.posB, e.RNAstrandB], 2)
exons_from, exons_to, frame_shifts = dfs.evaluate([e.chrA, e.posA, e.RNAstrandA], [e.chrB, e.posB, e.RNAstrandB], 2)
else:
frame_shifts = dfs.evaluate([e.chrB, e.posB, e.RNAstrandB], [e.chrA, e.posA, e.RNAstrandA], 2)
exons_from, exons_to, frame_shifts = dfs.evaluate([e.chrB, e.posB, e.RNAstrandB], [e.chrA, e.posA, e.RNAstrandA], 2)

done_breaks.add(e.chrA + ':' + str(e.posA) + '/' + str(e.posA + 1) + '(' + e.strandA + ')->' + e.chrB + ':' + str(e.posB) + '/' + str(e.posB + 1) + '(' + e.strandB + ')')

Expand All @@ -513,9 +516,13 @@ def insert_in_index(index, entries, score, i):

if params[0] not in done_breaks and n_split_reads > 0:
if e.donorA > e.donorB: # nice, use same thing to swap if necessary
frame_shifts = dfs.evaluate([e.chrA, posA, e.RNAstrandA], [e.chrB, posB, e.RNAstrandB], 2)
exons_from_, exons_to_, frame_shifts = dfs.evaluate([e.chrA, posA, e.RNAstrandA], [e.chrB, posB, e.RNAstrandB], 2)
else:
frame_shifts = dfs.evaluate([e.chrB, posB, e.RNAstrandB], [e.chrA, posA, e.RNAstrandA], 2)
exons_from_, exons_to_, frame_shifts = dfs.evaluate([e.chrB, posB, e.RNAstrandB], [e.chrA, posA, e.RNAstrandA], 2)

exons_from += exons_from_
exons_to += exons_to_
del(exons_from_, exons_to_)

fgd += [x[0] + '->' + x[1] for x in frame_shifts['fgd']]
frameshifts_0 += [x[0][0] + '->' + x[1][0] for x in frame_shifts[0]]
Expand All @@ -524,10 +531,15 @@ def insert_in_index(index, entries, score, i):

done_breaks.add(params[0])

e.exons_from = sorted(list(set(exons_from)))
e.exons_to = sorted(list(set(exons_to)))
del(exons_from, exons_to)

e.fgd = ','.join(sorted(list(set(fgd))))
e.frameshift_0 = ','.join(sorted(list(set(frameshifts_0))))
e.frameshift_1 = ','.join(sorted(list(set(frameshifts_1))))
e.frameshift_2 = ','.join(sorted(list(set(frameshifts_2))))
del(fgd, frameshifts_0, frameshifts_1, frameshifts_2)

if ffs:
e.is_on_splice_junction_motif(ffs)
Expand Down Expand Up @@ -581,7 +593,7 @@ def insert(pos, e):

top_result = (None, 9999999999999)
for r in sorted(results.keys()):
if results[r] >= 2:
if results[r] >= 2 and r.strandA == e.strandA and r.strandB == e.strandB:
d1 = (r.posA - e.posA)
d2 = (r.posB - e.posB)
sq_d = math.sqrt(pow(d1, 2) + pow(d2, 2))
Expand All @@ -596,7 +608,6 @@ def insert(pos, e):
insert_in_index(idx2, [e, top_result[0]], e.score + top_result[0].score, q)
else:
insert_in_index(idx2, [e], e.score, q)

q += 1

for e in remainder:
Expand All @@ -613,7 +624,7 @@ def insert(pos, e):
for entry in idx2[score][key]:
if entry not in exported:
acceptors_donors = entry.get_donors_acceptors(gene_annotation)
line = entry.line[:-5] + [entry.fgd, entry.frameshift_0, entry.frameshift_1, entry.frameshift_2, entry.edit_dist_to_splice_motif] + entry.line[-5:]
line = entry.line[:-5] + [entry.fgd, entry.frameshift_0, entry.frameshift_1, entry.frameshift_2, entry.edit_dist_to_splice_motif, ",".join(entry.exons_from), ",".join(entry.exons_to)] + entry.line[-5:]

fh_out.write(str(i) + "\t" + acceptors_donors + "\t" + "\t".join(line) + "\n")
exported.add(entry)
Expand Down
2 changes: 1 addition & 1 deletion drdisco/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
import logging
import sys

__version_info__ = ('0', '15', '2')
__version_info__ = ('0', '16', '0')
__version__ = '.'.join(__version_info__) if (len(__version_info__) == 3) else '.'.join(__version_info__[0:3]) + "-" + __version_info__[3]
__author__ = 'Youri Hoogstrate'
__homepage__ = 'https://github.com/yhoogstrate/dr-disco'
Expand Down
4 changes: 2 additions & 2 deletions tests/chim_overhang/test_01_integrate.out.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
shared-id fusion chr-A pos-A direction-A pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-B-acceptor pos-B-donor genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue lr-A-pvalue lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue lr-B-pvalue lr-B-stderr disco/split clips/score nodes/edge full-gene-dysregulation frameshift=0 frameshift=+1 frameshift=+2 splice-motif-edit-distance median-AS-A median-AS-B max-AS-A max-AS-B data-structure
1 chr2:17281790->chr11:5566704 chr2 17281790 + 0 22 chr11 5566704 - 22 0 inf entropy=0.7372<0.7382,chim_overhang=21<25 linear intronic 33 22 11 0 1530 10 1 1 1 0 0 0.7372 0.7372 0.0000 0.0000 0.3818 18.0000 0.8367 0.0013 0.0833 2.0455 38.7727 0.9652 0.0000 0.1847 0.0000 0.3333 2.0000 21 49 21 57 chr2:17281790/17281791(+)->chr11:5566704/5566705(-):(spanning_paired_1:11,spanning_paired_2:11)
shared-id fusion chr-A pos-A direction-A pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-B-acceptor pos-B-donor genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue lr-A-pvalue lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue lr-B-pvalue lr-B-stderr disco/split clips/score nodes/edge full-gene-dysregulation frameshift=0 frameshift=+1 frameshift=+2 splice-motif-edit-distance exons from (5') exons to (3') median-AS-A median-AS-B max-AS-A max-AS-B data-structure
1 chr2:17281790->chr11:5566704 chr2 17281790 + 0 22 chr11 5566704 - 22 0 inf entropy=0.7372<0.7382,chim_overhang=21<25 linear intronic 33 22 11 0 1530 10 1 1 1 0 0 0.7372 0.7372 0.0000 0.0000 0.3818 18.0000 0.8367 0.0013 0.0833 2.0455 38.7727 0.9652 0.0000 0.1847 0.0000 0.3333 2.0000 21 49 21 57 chr2:17281790/17281791(+)->chr11:5566704/5566705(-):(spanning_paired_1:11,spanning_paired_2:11)
4 changes: 2 additions & 2 deletions tests/chim_overhang/test_02_integrate.out.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
shared-id fusion chr-A pos-A direction-A pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-B-acceptor pos-B-donor genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue lr-A-pvalue lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue lr-B-pvalue lr-B-stderr disco/split clips/score nodes/edge full-gene-dysregulation frameshift=0 frameshift=+1 frameshift=+2 splice-motif-edit-distance median-AS-A median-AS-B max-AS-A max-AS-B data-structure
1 chr5:105166412->chr7:12213276 chr5 105166412 + 0 22 chr7 12213276 + 22 0 inf entropy=0.7372<0.7382,chim_overhang=16<25 linear intronic 33 22 11 0 1528 14 1 1 1 0 0 0.7372 0.7372 0.0000 0.0000 0.1273 15.0000 0.8367 0.0013 0.0278 0.6091 53.9545 0.9419 0.0000 0.0724 0.0000 0.3333 2.0000 16 57 16 59 chr5:105166412/105166413(+)->chr7:12213276/12213277(+):(spanning_paired_1:11,spanning_paired_2:11)
shared-id fusion chr-A pos-A direction-A pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-B-acceptor pos-B-donor genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue lr-A-pvalue lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue lr-B-pvalue lr-B-stderr disco/split clips/score nodes/edge full-gene-dysregulation frameshift=0 frameshift=+1 frameshift=+2 splice-motif-edit-distance exons from (5') exons to (3') median-AS-A median-AS-B max-AS-A max-AS-B data-structure
1 chr5:105166412->chr7:12213276 chr5 105166412 + 0 22 chr7 12213276 + 22 0 inf entropy=0.7372<0.7382,chim_overhang=16<25 linear intronic 33 22 11 0 1528 14 1 1 1 0 0 0.7372 0.7372 0.0000 0.0000 0.1273 15.0000 0.8367 0.0013 0.0278 0.6091 53.9545 0.9419 0.0000 0.0724 0.0000 0.3333 2.0000 16 57 16 59 chr5:105166412/105166413(+)->chr7:12213276/12213277(+):(spanning_paired_1:11,spanning_paired_2:11)
4 changes: 2 additions & 2 deletions tests/chim_overhang/test_03_integrate.out.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
shared-id fusion chr-A pos-A direction-A pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-B-acceptor pos-B-donor genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue lr-A-pvalue lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue lr-B-pvalue lr-B-stderr disco/split clips/score nodes/edge full-gene-dysregulation frameshift=0 frameshift=+1 frameshift=+2 splice-motif-edit-distance median-AS-A median-AS-B max-AS-A max-AS-B data-structure
1 chr8:86472668->chr17:40685761 chr8 86472668 - 6 8 chr17 40685761 + 8 6 inf chim_overhang=18<25 linear intronic 21 8 7 0 998 6 1 1 1 0 0 0.8277 0.8277 0.6547 0.0000 0.3214 53.7500 0.9186 0.0035 0.0619 0.2143 16.9286 0.8660 0.0117 0.0553 0.0000 0.1905 2.0000 55 18 56 18 chr8:86472668/86472669(-)->chr17:40685761/40685762(+):(spanning_paired_1:3,spanning_paired_1_t:4,spanning_paired_2:3,spanning_paired_2_t:4)
shared-id fusion chr-A pos-A direction-A pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-B-acceptor pos-B-donor genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue lr-A-pvalue lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue lr-B-pvalue lr-B-stderr disco/split clips/score nodes/edge full-gene-dysregulation frameshift=0 frameshift=+1 frameshift=+2 splice-motif-edit-distance exons from (5') exons to (3') median-AS-A median-AS-B max-AS-A max-AS-B data-structure
1 chr8:86472668->chr17:40685761 chr8 86472668 - 6 8 chr17 40685761 + 8 6 inf chim_overhang=18<25 linear intronic 21 8 7 0 998 6 1 1 1 0 0 0.8277 0.8277 0.6547 0.0000 0.3214 53.7500 0.9186 0.0035 0.0619 0.2143 16.9286 0.8660 0.0117 0.0553 0.0000 0.1905 2.0000 55 18 56 18 chr8:86472668/86472669(-)->chr17:40685761/40685762(+):(spanning_paired_1:3,spanning_paired_1_t:4,spanning_paired_2:3,spanning_paired_2_t:4)
Loading

0 comments on commit f78b484

Please sign in to comment.