Skip to content

Commit

Permalink
added support for fusion novelty type in the abundance utility
Browse files Browse the repository at this point in the history
  • Loading branch information
fairliereese committed Oct 18, 2023
1 parent 74dbc10 commit 934d2f8
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 3 deletions.
10 changes: 10 additions & 0 deletions src/talon/post/create_abundance_file_from_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,8 @@ def get_gene_and_transcript_novelty_types(gene_ID, transcript_ID, novelty_type):
# Look for gene type
if gene_ID in novelty_type.antisense_genes:
curr_novel["gene_novelty"] = "Antisense"
elif gene_ID in novelty_type.fusion_genes:
curr_novel["gene_novelty"] = 'Fusion'
elif gene_ID in novelty_type.intergenic_genes:
curr_novel["gene_novelty"] = "Intergenic"
elif gene_ID in novelty_type.known_genes:
Expand All @@ -317,6 +319,8 @@ def get_gene_and_transcript_novelty_types(gene_ID, transcript_ID, novelty_type):
curr_novel["transcript_novelty"] = "Intergenic"
elif transcript_ID in novelty_type.genomic_transcripts:
curr_novel["transcript_novelty"] = "Genomic"
elif transcript_ID in novelty_type.fusion_transcripts:
curr_novel["transcript_novelty"] = "Fusion"
elif transcript_ID in novelty_type.known_transcripts:
curr_novel["transcript_novelty"] = "Known"
else:
Expand Down Expand Up @@ -396,9 +400,14 @@ def make_novelty_type_struct(database, datasets):
cursor = conn.cursor()

novelty_type = dstruct.Struct()

# genes
novelty_type.known_genes = set(qutils.fetch_all_known_genes_detected(cursor, datasets))
novelty_type.antisense_genes = set(qutils.fetch_antisense_genes(cursor, datasets))
novelty_type.intergenic_genes = set(qutils.fetch_intergenic_novel_genes(cursor, datasets))
novelty_type.fusion_genes = set(qutils.fetch_fusion_novel_genes(cursor, datasets))

# transcripts
novelty_type.known_transcripts = set(qutils.fetch_all_known_transcripts_detected(cursor, datasets))
novelty_type.ISM_transcripts = set(qutils.fetch_all_ISM_transcripts(cursor, datasets))
novelty_type.ISM_prefix = set(qutils.fetch_prefix_ISM_transcripts(cursor, datasets))
Expand All @@ -408,6 +417,7 @@ def make_novelty_type_struct(database, datasets):
novelty_type.antisense_transcripts = set(qutils.fetch_antisense_transcripts(cursor, datasets))
novelty_type.intergenic_transcripts = set(qutils.fetch_intergenic_transcripts(cursor, datasets))
novelty_type.genomic_transcripts = set(qutils.fetch_genomic_transcripts(cursor, datasets))
novelty_type.fusion_transcripts = set(qutils.fetch_fusion_transcripts(cursor, datasets))

conn.close()
return novelty_type
Expand Down
36 changes: 33 additions & 3 deletions src/talon/query_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,8 @@ def fetch_reproducible_NNCs(cursor, datasets):

datasets = format_for_IN(datasets)
query = (
"""SELECT gene_ID,
a.transcript_ID
"""SELECT gene_ID,
a.transcript_ID
FROM abundance as a
LEFT JOIN transcript_annotations as ta
ON ta.ID = a.transcript_ID
Expand Down Expand Up @@ -353,14 +353,29 @@ def fetch_intergenic_novel_genes(cursor, datasets):
genes = [x[0] for x in cursor.fetchall()]
return genes

def fetch_fusion_novel_genes(cursor, datasets):
    """Fetch IDs of novel genes denoted as fusion.

    Selects the distinct ``gene_ID`` values from the ``observed`` table whose
    joined ``gene_annotations`` row has ``attribute = 'fusion_novel'`` and
    whose ``dataset`` is among ``datasets``.

    Args:
        cursor: Database cursor used to execute the query and fetch rows.
        datasets: Dataset selection handed to ``format_for_IN``.
            NOTE(review): presumably rendered as a parenthesised SQL list
            suitable for an ``IN`` clause -- confirm against
            ``format_for_IN``.

    Returns:
        A list holding the first column (the gene ID) of every fetched row.
    """
    in_clause = format_for_IN(datasets)

    # The IN clause is appended directly after the trailing "IN " of the
    # statement text, so that trailing space must be kept.
    sql = (
        "SELECT DISTINCT(gene_ID) FROM observed\n"
        "LEFT JOIN gene_annotations AS ga ON ga.ID = observed.gene_ID\n"
        "WHERE (ga.attribute = 'fusion_novel')\n"
        "AND observed.dataset IN "
    ) + in_clause
    cursor.execute(sql)

    gene_ids = []
    for row in cursor.fetchall():
        gene_ids.append(row[0])
    return gene_ids


def fetch_all_ISM_transcripts(cursor, datasets):
"""Fetch IDs of all ISM transcripts"""

datasets = format_for_IN(datasets)
query = (
"""SELECT DISTINCT(transcript_ID) FROM observed
LEFT JOIN transcript_annotations
LEFT JOIN transcript_annotations
AS ta ON ta.ID = observed.transcript_ID
WHERE (ta.attribute = 'ISM_transcript')
AND observed.dataset IN """
Expand Down Expand Up @@ -489,6 +504,21 @@ def fetch_genomic_transcripts(cursor, datasets):
transcripts = [x[0] for x in cursor.fetchall()]
return transcripts

def fetch_fusion_transcripts(cursor, datasets):
    """Fetch IDs of all fusion transcripts.

    Selects the distinct ``transcript_ID`` values from the ``observed`` table
    whose joined ``transcript_annotations`` row has
    ``attribute = 'fusion_transcript'`` and whose ``dataset`` is among
    ``datasets``.

    Args:
        cursor: Database cursor used to execute the query and fetch rows.
        datasets: Dataset selection handed to ``format_for_IN``.
            NOTE(review): presumably rendered as a parenthesised SQL list
            suitable for an ``IN`` clause -- confirm against
            ``format_for_IN``.

    Returns:
        A list holding the first column (the transcript ID) of every
        fetched row.
    """
    in_clause = format_for_IN(datasets)

    # The IN clause is appended directly after the trailing "IN " of the
    # statement text, so that trailing space must be kept.
    sql = (
        "SELECT DISTINCT(transcript_ID) FROM observed\n"
        "LEFT JOIN transcript_annotations\n"
        "AS ta ON ta.ID = observed.transcript_ID\n"
        "WHERE (ta.attribute = 'fusion_transcript')\n"
        "AND observed.dataset IN "
    ) + in_clause
    cursor.execute(sql)

    transcript_ids = []
    for row in cursor.fetchall():
        transcript_ids.append(row[0])
    return transcript_ids

def fetch_all_transcript_gene_pairs(cursor):
"""Return gene_ID - transcript_ID tuples from database"""
Expand Down

0 comments on commit 934d2f8

Please sign in to comment.