Merge branch 'dev' into 'main'

next release See merge request tron/easyfuse-pipeline!47
TRON-Bioinformatics · Dec 12, 2024 · 9e0f8bb · 9e0f8bb
2 parents 5b7ae74 + d522d93
commit 9e0f8bb
Show file tree

Hide file tree

Showing 28 changed files with 98,710 additions and 82 deletions.
diff --git a/.gitignore b/.gitignore
@@ -7,4 +7,9 @@ trace.txt*
 report.html*
 test/output
 conda_env
-__pycache__/
+__pycache__/
+trace*.txt
+dag*.html
+report*.html
+timeline*.html
+venv/
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,32 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+
+## Unreleased
+
+### Added
+
+### Changed
+
+### Fixed
+
+### Removed
+
+## [2.0.4] - 2024-12-09
+
+### Added
+
+- Added full length protein sequence to the final output
+- Specifiy computational requirements via predefined labels: single, low and medium
+
+### Changed
+
+- Updated NextflowVersion to 24.10.1
+- Updated resource management
+- Fixed exon count in final output
+- Fixed tool_frac column in final output
+- Updated prediction model based on new results
+
 ## [2.0.3] - 2024-04-04
 
 ### Added
@@ -144,7 +170,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Initial release on GitHub
 
 
-[2.1.0]: https://github.com/TRON-Bioinformatics/EasyFuse/v2.0.2...v2.1.0
+[Unreleased]: https://github.com/TRON-Bioinformatics/EasyFuse/v2.0.3...dev
+[2.0.4]: https://github.com/TRON-Bioinformatics/EasyFuse/v2.0.3...v2.0.4
+[2.0.3]: https://github.com/TRON-Bioinformatics/EasyFuse/v2.0.2...v2.0.3
 [2.0.2]: https://github.com/TRON-Bioinformatics/EasyFuse/v2.0.1...v2.0.2
 [2.0.1]: https://github.com/TRON-Bioinformatics/EasyFuse/v2.0.0...v2.0.1
 [2.0.0]: https://github.com/TRON-Bioinformatics/EasyFuse/v1.3.7...v2.0.0

diff --git a/README.md b/README.md
@@ -17,7 +17,7 @@ The current version of EasyFuse uses three fusion gene detection tools, [STAR-Fu
 
 ### Dependencies
 
- - [NextFlow, 23.10.1](https://www.nextflow.io/)
+ - [NextFlow, 24.10.1](https://www.nextflow.io/)
  - [Conda](https://docs.anaconda.com/free/anaconda/install/index.html)
 
 Please have a look at environment.yml.
@@ -124,6 +124,8 @@ Overview of all features/columns annotated by EasyFuse:
   - `trans_inv`:  Genes on different chromosomes, different strands
 
 - **exon_nr:** Number of exons involved in the fusion transcript  
+- **ft1_exon_nr:** Exon number of fusion partner 1 that is invoved in building the transcript breakpoint 
+- **ft2_exon_nr:** Exon number of fusion partner 2 that is invoved in building the transcript breakpoint 
 - **exon_starts:** Genomic starting positions of involved exons
 - **exon_ends:** Genomic end positions of involved exons  
 - **exon_boundary1:** Exon boundary of the breakpoint in Gene1 
@@ -147,7 +149,9 @@ Overview of all features/columns annotated by EasyFuse:
 - **context_sequence:** The fusion transcript sequence downstream and upstream from the breakpoint (default 800 bp, shorter if transcript start or end occurs within the region)  
 - **context_sequence_bp:** Position of breakpoint in context sequence  
 - **neo_peptide_sequence:** Translated peptide sequence of context sequence starting at 13 aa before breakpoint until 13 aa after breakpoint (for in-frame transcripts) or until next stop codon (for out frame and neo frame). This is to consider only the region around the breakpoint that may contain neo-epitopes.
-- **neo_peptide_sequence_bp:** Breakpoint on translated peptide sequence.  
+- **neo_peptide_sequence_bp:** Breakpoint on translated peptide sequence. 
+- **fusion_protein_sequence:** Full-length protein sequence
+- **fusion_protein_sequence_bp:** Position of breakpoint in full-length protein seqeunce  
 - ***toolname*_detected:** 1 if breakpoint was detected by respective tool, 0 if not
 - ***toolname*_junc:** Junction read count (reads covering breakpoint) reported by *toolname*  
 - ***toolname*_span:** Spanning read count (read pairs with each partner on one side of breakpoint) reported by *toolname*  

diff --git a/bin/fusionannotation.py b/bin/fusionannotation.py
@@ -128,6 +128,7 @@ def define_type(
     def get_bp_overlapping_features(self, bp_chr, bp_pos):
         """Get two lists with breakpoint overlapping exons and cds from the database"""
 
+        #TODO: Annotate intergenic/intronic breakpoints
         bp_exons = []
         bp_cds = []
         for efeature in self.db.region(
@@ -219,22 +220,47 @@ def get_parents(self, efeature):
             )
         return trans_id, trans_biotype, gene_name, gene_biotype, description
 
-    def get_wt_codings(self, trans_id):
+
+    def get_wt_codings(self, trans_id, bp, is_bp1=True, strand="+"):
         """Get matched exons and cds region per transcript id"""
+
         exon_pos_list = []
+        exon_until_bp_pos_list = []
         cds_pos_list = []
+        cds_until_bp_pos_list = []
         cds_frame_list = []
         for feature in self.db.children(
             trans_id, featuretype=("exon", "CDS"), order_by="start", reverse=False
         ):
             if feature.featuretype == "exon":
+                if strand == "+" and is_bp1 or strand == "-" and not is_bp1:
+                    if feature.stop >= bp:
+                        exon_until_bp_pos_list.append((feature.start, bp))
+                    else:
+                        exon_until_bp_pos_list.append((feature.start, feature.stop))
+                else:
+                    if feature.start <= bp:
+                        exon_until_bp_pos_list.append((bp, feature.stop))
+                    else:
+                        exon_until_bp_pos_list.append((feature.start, feature.stop))
                 exon_pos_list.append((feature.start, feature.stop))
             elif feature.featuretype == "CDS":
+                if strand == "+" and is_bp1 or strand == "-" and not is_bp1:
+                    if feature.stop >= bp:
+                        cds_until_bp_pos_list.append((feature.start, bp))
+                    else:
+                        cds_until_bp_pos_list.append((feature.start, feature.stop))
+                else:
+                    if feature.start <= bp:
+                        cds_until_bp_pos_list.append((bp, feature.stop))
+                    else:
+                        cds_until_bp_pos_list.append((feature.start, feature.stop))
                 cds_pos_list.append((feature.start, feature.stop))
                 cds_frame_list.append(feature.frame)
         # correct list positions in such a way that the a cds is enclosed by an exon or not all available
         # exons are the scaffold as there can be exons w/o cds but not vv
         cds_pos_list2 = []
+        cds_until_bp_pos_list2 = []
         cds_frame_list2 = []
         for exon_start, exon_stop in exon_pos_list:
             exon_has_cds = False
@@ -250,7 +276,21 @@ def get_wt_codings(self, trans_id):
             if not exon_has_cds:
                 cds_pos_list2.append("NA")
                 cds_frame_list2.append("NA")
-        return exon_pos_list, cds_pos_list2, cds_frame_list2
+
+        for exon_start, exon_stop in exon_until_bp_pos_list:
+            exon_has_cds = False
+            for cds_counter, (cds_start, cds_stop) in enumerate(cds_until_bp_pos_list, 0):
+                if cds_start >= exon_start and cds_stop <= exon_stop:
+                    cds_until_bp_pos_list2.append((cds_start, cds_stop))
+                    if cds_start > exon_start and cds_stop < exon_stop:
+                        self.trans_flags_dict[trans_id].add(
+                            "cds != exon at no {}".format(cds_counter)
+                        )
+                    exon_has_cds = True
+            if not exon_has_cds:
+                cds_until_bp_pos_list2.append("NA")
+        return exon_pos_list, exon_until_bp_pos_list, cds_pos_list2, cds_until_bp_pos_list2, cds_frame_list2
+
 
     @staticmethod
     def check_exon_overlap(wt1_exon_pos_list, wt2_exon_pos_list, fusion_type):
@@ -430,6 +470,8 @@ def run(
             "context_sequence_100_id",
             "type",
             "exon_nr",
+            "ft1_exon_nr",
+            "ft2_exon_nr",
             "exon_starts",
             "exon_ends",
             "exon_boundary1",
@@ -442,6 +484,8 @@ def run(
             "context_sequence_bp",
             "neo_peptide_sequence",
             "neo_peptide_sequence_bp",
+            "fusion_protein_sequence",
+            "fusion_protein_sequence_bp",
             "context_sequence_wt1",
             "context_sequence_wt2",
             "context_sequence_wt1_bp",
@@ -513,7 +557,7 @@ def run(
         # information in the header is complete and useful for debugging, information in the short_header is identical
         # to the format of the previous
         # version of the fusion annotation and therefore used for downstream processing
-        short_header = header[0:21]
+        short_header = header[0:25]
         results_lists = []
 
         # get features overlapping with the bp from the db
@@ -560,9 +604,11 @@ def run(
                 # the exon) and start frames of the cds
                 (
                     wt1_exon_pos_list,
+                    wt1_exon_until_bp_pos_list,
                     wt1_cds_pos_list,
+                    wt1_cds_until_bp_pos_list,
                     wt1_cds_frame_list,
-                ) = self.get_wt_codings(wt1_trans_id)
+                ) = self.get_wt_codings(wt1_trans_id, bp1_pos, True, bp1_strand)
                 # get the frame at the start of the first cds and at the breakpoint
                 wt1_frame_at_start, wt1_frame_at_bp = self.get_frame(
                     bp1_pos, wt1_cds_pos_list, wt1_cds_frame_list, bp1_strand
@@ -612,9 +658,11 @@ def run(
 
                     (
                         wt2_exon_pos_list,
+                        wt2_exon_until_bp_pos_list,
                         wt2_cds_pos_list,
+                        wt2_cds_until_bp_pos_list,
                         wt2_cds_frame_list,
-                    ) = self.get_wt_codings(wt2_trans_id)
+                    ) = self.get_wt_codings(wt2_trans_id, bp2_pos, False, bp2_strand)
                     # check whether loci of wt1/wt2 overlap
                     if self.check_exon_overlap(
                         wt1_exon_pos_list, wt2_exon_pos_list, fusion_type
@@ -650,7 +698,9 @@ def run(
                     )
                     # Collect positions for which we need to grep sequences
                     self.fill_seq_lookup_dict(bp1_chr, wt1_exon_pos_list)
+                    #self.fill_seq_lookup_dict(bp1_chr, wt1_exon_until_bp_pos_list)
                     self.fill_seq_lookup_dict(bp2_chr, wt2_exon_pos_list)
+                    #self.fill_seq_lookup_dict(bp2_chr, wt2_exon_until_bp_pos_list)
                     self.fill_seq_lookup_dict(bp1_chr, ft1_exon_pos_list)
                     self.fill_seq_lookup_dict(bp2_chr, ft2_exon_pos_list)
                     self.fill_seq_lookup_dict(bp1_chr, wt1_cds_pos_list)
@@ -662,6 +712,9 @@ def run(
                         exon_nr = len(ft1_cds_pos_list) + len(ft2_cds_pos_list)
                     else:
                         exon_nr = len(ft1_cds_pos_list) + len(ft2_exon_pos_list)
+
+                    ft1_exon_nr = len(ft1_exon_pos_list)
+                    ft2_exon_nr = len(wt2_exon_pos_list) - len(ft2_exon_pos_list) + 1
                     wt1_is_good_transcript = self.trans_flags_dict[wt1_trans_id]
                     wt2_is_good_transcript = self.trans_flags_dict[wt2_trans_id]
                     results_lists.append(
@@ -675,6 +728,8 @@ def run(
                             "context_sequence_100_id",
                             fusion_type,
                             str(exon_nr),
+                            str(ft1_exon_nr),
+                            str(ft2_exon_nr),
                             "exon_starts",
                             "exon_ends",
                             exon_boundary1,
@@ -687,6 +742,8 @@ def run(
                             "context_sequence_bp",
                             "neo_peptide_sequence",
                             "neo_peptide_sequence_bp",
+                            "fusion_protein_sequence",
+                            "fusion_protein_sequence_bp",
                             "context_sequence_wt1",
                             "context_sequence_wt2",
                             "context_sequence_wt1_bp",
@@ -978,6 +1035,14 @@ def run(
                 header_dict["fusion_peptide"]
             ][max(0, int(bp_in_fusion_aa) - 13) :]
 
+            result_list[header_dict["fusion_protein_sequence"]] = result_list[
+                header_dict["fusion_peptide"]
+            ]
+
+            result_list[header_dict["fusion_protein_sequence_bp"]] = round(
+                    bp_in_fusion_aa, 1
+                )
+
             if (int(bp_in_fusion_aa) - 13) < 0:
                 result_list[header_dict["neo_peptide_sequence_bp"]] = round(
                     bp_in_fusion_aa, 1

diff --git a/bin/merge_data.py b/bin/merge_data.py
@@ -186,9 +186,9 @@ def add_tool_counts(self, detected_fusions_data):
                             self.data[key]["{}_detected".format(tool.lower())] = "0"
                             self.data[key]["{}_junc".format(tool.lower())] = "0"
                             self.data[key]["{}_span".format(tool.lower())] = "0"
-                tool_count = len(detected_fusions_data[bpid])
-                #self.data[key]["tool_count"] = str(tool_count)
-                self.data[key]["tool_frac"] = str(float(tool_count)/len(self.fusion_tools))
+                        tool_count = len(detected_fusions_data[bpid])
+                        #self.data[key]["tool_count"] = str(tool_count)
+                        self.data[key]["tool_frac"] = str(float(tool_count)/len(self.fusion_tools))
         logger.info("Added tool count data.")
 
 
@@ -349,4 +349,4 @@ def main():
     summary.run()
 
 if __name__ == "__main__":
-    main()
+    main()
diff --git a/data/model/Fusion_modeling_FFPE_train_v37.random_forest.model_full_data.EF_full.rds b/data/model/Fusion_modeling_FFPE_train_v37.random_forest.model_full_data.EF_full.rds
diff --git a/data/model/Fusion_modeling_FFPE_train_v37.random_forest.model_full_data.EF_full_rep.rds b/data/model/Fusion_modeling_FFPE_train_v37.random_forest.model_full_data.EF_full_rep.rds
diff --git a/data/model/Fusion_modeling_FFPE_train_v37.random_forest.model_full_data.EF_requant.rds b/data/model/Fusion_modeling_FFPE_train_v37.random_forest.model_full_data.EF_requant.rds
diff --git a/data/model/Fusion_modeling_FFPE_train_v37.random_forest.model_full_data.EF_requant_type.rds b/data/model/Fusion_modeling_FFPE_train_v37.random_forest.model_full_data.EF_requant_type.rds
diff --git a/data/model/Fusion_modeling_FFPE_train_v38.random_forest.model_full_data.EF_full.rds b/data/model/Fusion_modeling_FFPE_train_v38.random_forest.model_full_data.EF_full.rds
diff --git a/data/model/Fusion_modeling_FFPE_train_v38.random_forest.model_full_data.EF_full_rep.rds b/data/model/Fusion_modeling_FFPE_train_v38.random_forest.model_full_data.EF_full_rep.rds
diff --git a/data/model/Fusion_modeling_FFPE_train_v38.random_forest.model_full_data.EF_requant.rds b/data/model/Fusion_modeling_FFPE_train_v38.random_forest.model_full_data.EF_requant.rds
diff --git a/data/model/Fusion_modeling_FFPE_train_v38.random_forest.model_full_data.EF_requant_type.rds b/data/model/Fusion_modeling_FFPE_train_v38.random_forest.model_full_data.EF_requant_type.rds
diff --git a/environment.yml b/environment.yml
@@ -1,6 +1,6 @@
-name: conda_env
+name: easyfuse_env
 channels:
   - bioconda
 dependencies:
   - python=3.7
-  - nextflow=23.10.1
+  - nextflow=24.10.1
diff --git a/main.nf b/main.nf
@@ -4,7 +4,7 @@ nextflow.enable.dsl = 2
 
 include { FASTP } from './modules/01_qc'
 include { STAR ; STAR_ARRIBA ; READ_FILTER ; BAM2FASTQ } from './modules/02_alignment'
-include { FUSION_CATCHER ; STAR_FUSION ; ARRIBA; FUSION_CATCHER_INDEX } from './modules/03_fusion_callers'
+include { FUSION_CATCHER ; STAR_FUSION ; ARRIBA } from './modules/03_fusion_callers'
 include { FUSION_PARSER ; FUSION_ANNOTATION } from './modules/04_joint_fusion_calling'
 include { FUSION2CSV ; CSV2FASTA ; STAR_INDEX ; FUSION_FILTER ; STAR_CUSTOM ; READ_COUNT } from './modules/05_requantification'
 include { MERGE_DATA ; PREDICTION } from './modules/06_summarize'

diff --git a/modules/01_qc.nf b/modules/01_qc.nf
@@ -1,8 +1,7 @@
 
 process FASTP {
-    cpus 6
-    memory "8g"
     tag "${name}"
+    label 'process_low'
 
     conda ("${baseDir}/environments/qc.yml")
 

diff --git a/modules/02_alignment.nf b/modules/02_alignment.nf
@@ -1,8 +1,7 @@
 
 process STAR {
-    cpus 6
-    memory "32g"
     tag "${name}"
+    label 'process_medium'
 
     conda ("${baseDir}/environments/alignment.yml")
 
@@ -42,11 +41,9 @@ process STAR {
 }
 
 
-
 process STAR_ARRIBA {
-    cpus 6
-    memory "32g"
     tag "${name}"
+    label 'process_medium'
 
     conda ("${baseDir}/environments/alignment.yml")
 
@@ -84,9 +81,8 @@ process STAR_ARRIBA {
 
 
 process READ_FILTER {
-    cpus 1
-    memory "8g"
     tag "${name}"
+    label 'process_single'
 
     conda ("${baseDir}/environments/filtering.yml")
 
@@ -105,9 +101,8 @@ process READ_FILTER {
 }
 
 process BAM2FASTQ {
-    cpus 6
-    memory "8g"
     tag "${name}"
+    label 'process_low'
 
     conda ("${baseDir}/environments/samtools.yml")