From 77ab3a420af60063568e7e533a8d62c83f05bbb9 Mon Sep 17 00:00:00 2001
From: ljyanesm <yanes.luis@gmail.com>
Date: Sat, 4 Sep 2021 06:55:00 +0100
Subject: [PATCH] Accepts different genetic codes

---
 annotation/__main__.py                        | 32 ++++++-----
 annotation/transcriptome.py                   | 56 +++++++++++++++++--
 annotation/transcriptome_module/main.wdl      |  4 ++
 annotation/transcriptome_module/mikado.wdl    | 12 ++++
 .../mikado/orf_caller/wf_prodigal.wdl         | 28 ----------
 .../subworkflows/mikado/wf_mikado.wdl         |  9 ++-
 6 files changed, 92 insertions(+), 49 deletions(-)

diff --git a/annotation/__main__.py b/annotation/__main__.py
index 9672f80..a9c4d5b 100644
--- a/annotation/__main__.py
+++ b/annotation/__main__.py
@@ -38,7 +38,8 @@
 
 from annotation import VERSION
 from annotation.homology import combine_arguments_homology, validate_homology_inputs
-from annotation.transcriptome import combine_arguments, validate_transcriptome_inputs
+from annotation.transcriptome import combine_arguments, validate_transcriptome_inputs, transcriptome_cli_validation, \
+    genetic_code_str_to_int
 
 LONG_READ_ALIGNER_CHOICES = ['minimap2', 'gmap', '2pass', '2pass_merged']
 RUN_METADATA = "run_details.json"
@@ -225,6 +226,12 @@ def parse_arguments():
                                   help="Base parameters file, this file can be the output of a previous REAT run "
                                        "which will be used as the base for a new parameters file written to the"
                                        " output_parameters_file argument")
+    transcriptome_ap.add_argument("--genetic_code",
+                                  help=f"Parameter for the translation table used in Mikado for translating CDS "
+                                       f"sequences, and for ORF calling, can take values in the genetic code range of "
+                                       f"NCBI as an integer. E.g 1, 6, 10 "
+                                       f"or when using TransDecoder as ORF caller, one of: "
+                                       f"{', '.join(genetic_code_str_to_int.keys())}", default='1')
 
     # Mikado arguments
     mikado_parameters = transcriptome_ap.add_argument_group("Mikado", "Parameters for Mikado runs")
@@ -440,18 +447,11 @@ def parse_arguments():
 
     args = reat_ap.parse_args()
 
+    genetic_code = 0
+    mikado_genetic_code = 0
     if args.reat_module == 'transcriptome':
-        if args.separate_mikado_LQ:
-            if not args.long_lq_scoring_file:
-                reat_ap.error("When '--separate_mikado_LQ' is enabled, --long_lq_scoring_file is required, please "
-                              "provide it.")
-
-        if args.samples and (args.csv_paired_samples or args.csv_long_samples):
-            reat_ap.error("Conflicting arguments '--samples' and ['--csv_paired_samples' or '--csv_long_samples'] "
-                          "provided, please choose one of csv or json sample input format")
-        if not args.samples and not args.csv_paired_samples and not args.csv_long_samples:
-            reat_ap.error("Please provide at least one of --samples, --csv_paired_samples, --csv_long_samples")
-    return args
+        genetic_code, mikado_genetic_code = transcriptome_cli_validation(args, reat_ap)
+    return args, genetic_code, mikado_genetic_code
 
 
 def symlink(path, out_file):
@@ -642,7 +642,7 @@ def link_bams(outputs, outputs_path, bams_array, stats_array, stats_table):
         symlink(alignments_path, outputs[stats_table])
 
 
-def transcriptome_module(cli_arguments):
+def transcriptome_module(cli_arguments, genetic_code='1', mikado_genetic_code=1):
     """
     Collects the CLI arguments and combines them with CLI defined input files.
     The resulting object is validated and the final inputs are written to a json Cromwell input file.
@@ -651,6 +651,8 @@ def transcriptome_module(cli_arguments):
     """
     # Print input file for cromwell
     cromwell_inputs = combine_arguments(cli_arguments)
+    cromwell_inputs['ei_annotation.mikado_genetic_code'] = mikado_genetic_code
+    cromwell_inputs['ei_annotation.genetic_code'] = str(genetic_code)
     cromwell_jar, runtime_config = prepare_cromwell_arguments(cli_arguments)
 
     # Validate input against schema
@@ -880,7 +882,7 @@ def main():
     print("\n")
 
     start_time = time.time()
-    cli_arguments = parse_arguments()
+    cli_arguments, genetic_code, mikado_genetic_code = parse_arguments()
     check_environment()
 
     # Check options.json
@@ -893,7 +895,7 @@ def main():
         print('... ' if start else '', snippet, ' ...' if stop < len(err.doc) else '', sep="")
 
     if cli_arguments.reat_module == "transcriptome":
-        rc = transcriptome_module(cli_arguments)
+        rc = transcriptome_module(cli_arguments, genetic_code, mikado_genetic_code)
     elif cli_arguments.reat_module == "homology":
         rc = homology_module(cli_arguments)
     else:
diff --git a/annotation/transcriptome.py b/annotation/transcriptome.py
index 8b71aba..e8400fc 100644
--- a/annotation/transcriptome.py
+++ b/annotation/transcriptome.py
@@ -12,6 +12,20 @@
 
 from annotation import report_errors
 
+UNSUPPORTED_GENETIC_CODES_INT = [2, 3, 4, 5, 9, 11, 13, 14, 16, 21, 22, 23, 24]
+
+genetic_code_str_to_int = {'Universal': 1,
+                           'Tetrahymena': 6,
+                           'Acetabularia': 6,
+                           'Ciliate': 6,
+                           'Dasycladacean': 6,
+                           'Hexamita': 6,
+                           'Candida': 1,
+                           'Euplotid': 10,
+                           'SR1_Gracilibacteria': 25,
+                           'Pachysolen_tannophilus': 1,
+                           'Peritrich': 6}
+
 try:
     from yaml import CLoader as Loader, CDumper as Dumper, load, dump
 except ImportError:
@@ -49,15 +63,15 @@ def separate_mikado_config(mikado_config, mikado_run):
     if any((prepare, serialise, pick)):
         Path(mikado_run).mkdir(exist_ok=True)
         if prepare != previous_prepare:
-            prepare_path = Path(mikado_run).joinpath(ctimestamp+"-prepare.yaml")
+            prepare_path = Path(mikado_run).joinpath(ctimestamp + "-prepare.yaml")
             print(dump({'prepare': prepare}, default_flow_style=False),
                   file=open(prepare_path, 'w'))
         if serialise != previous_serialise:
-            serialise_path = Path(mikado_run).joinpath(ctimestamp+"-serialise.yaml")
+            serialise_path = Path(mikado_run).joinpath(ctimestamp + "-serialise.yaml")
             print(dump({'serialise': serialise}, default_flow_style=False),
                   file=open(serialise_path, 'w'))
         if pick != previous_pick:
-            pick_path = Path(mikado_run).joinpath(ctimestamp+"-pick.yaml")
+            pick_path = Path(mikado_run).joinpath(ctimestamp + "-pick.yaml")
             print(dump({'pick': pick}, default_flow_style=False),
                   file=open(pick_path, 'w'))
 
@@ -218,7 +232,8 @@ def validate_paired_samples(samples):
         if name in names:
             errors[line].append(("Non-unique label specified: '{}'".format(name)))
         if strand.lower() not in strands:
-            errors[line].append(("Incorrect strand '{}' specification, please choose one of {}".format(strand, strands)))
+            errors[line].append(
+                ("Incorrect strand '{}' specification, please choose one of {}".format(strand, strands)))
 
         merge = merge.lower()
         if merge in ("true", "false"):
@@ -464,3 +479,36 @@ def validate_transcriptome_inputs(cromwell_inputs):
     all_validators["is_name"] = is_valid_name
     reat_validator = validators.create(meta_schema=reat_schema, validators=all_validators)
     reat_validator(reat_schema).validate(cromwell_inputs)
+
+
+def transcriptome_cli_validation(args, reat_ap):
+    if args.separate_mikado_LQ:
+        if not args.long_lq_scoring_file:
+            reat_ap.error("When '--separate_mikado_LQ' is enabled, --long_lq_scoring_file is required, please "
+                          "provide it.")
+    if args.samples and (args.csv_paired_samples or args.csv_long_samples):
+        reat_ap.error("Conflicting arguments '--samples' and ['--csv_paired_samples' or '--csv_long_samples'] "
+                      "provided, please choose one of csv or json sample input format")
+    if not args.samples and not args.csv_paired_samples and not args.csv_long_samples:
+        reat_ap.error("Please provide at least one of --samples, --csv_paired_samples, --csv_long_samples")
+
+    if args.genetic_code.isdigit():
+        genetic_code = int(args.genetic_code)
+        if genetic_code in UNSUPPORTED_GENETIC_CODES_INT or genetic_code > 25:
+            reat_ap.error(
+                f"Sorry, genetic_code={args.genetic_code} is not supported, please use one of "
+                f"{set(range(0, 25)) - set(UNSUPPORTED_GENETIC_CODES_INT)}")
+        if args.orf_caller == 'transdecoder':
+            reat_ap.error(f"Please use one of {', '.join(genetic_code_str_to_int.keys())}, as these define "
+                          f"more specific genetic codes when using TransDecoder as the ORF caller")
+    else:
+        genetic_code = 0
+        try:
+            genetic_code = genetic_code_str_to_int[args.genetic_code]
+        except KeyError:
+            reat_ap.error(
+                f"Sorry, genetic_code={args.genetic_code} is not supported, please use one of "
+                f"{', '.join(genetic_code_str_to_int.keys())}")
+    mikado_genetic_code = genetic_code
+
+    return genetic_code, mikado_genetic_code
\ No newline at end of file
diff --git a/annotation/transcriptome_module/main.wdl b/annotation/transcriptome_module/main.wdl
index 22b147a..edf3544 100644
--- a/annotation/transcriptome_module/main.wdl
+++ b/annotation/transcriptome_module/main.wdl
@@ -11,6 +11,8 @@ workflow ei_annotation {
         Array[LRSample]? HQ_long_read_samples
         File? annotation
         Int annotation_score = 1
+        String genetic_code
+        Int mikado_genetic_code
         Boolean merge_all_junctions = false
         String mode
         Boolean check_reference
@@ -72,6 +74,8 @@ workflow ei_annotation {
         LQ_assemblies = wf_align.LQ_gff,
         HQ_assemblies = wf_align.HQ_gff,
         orf_calling_proteins = orf_calling_proteins,
+        genetic_code = genetic_code,
+        mikado_genetic_code = mikado_genetic_code,
         homology_proteins = homology_proteins,
         annotation_score = annotation_score,
         mode = mode,
diff --git a/annotation/transcriptome_module/mikado.wdl b/annotation/transcriptome_module/mikado.wdl
index 8c27ffa..8c2cbc1 100644
--- a/annotation/transcriptome_module/mikado.wdl
+++ b/annotation/transcriptome_module/mikado.wdl
@@ -11,6 +11,8 @@ workflow wf_main_mikado {
 
         Int annotation_score = 1
         String mode
+        String genetic_code
+        Int mikado_genetic_code
         Boolean check_reference
 
         File all_scoring_file
@@ -158,6 +160,8 @@ workflow wf_main_mikado {
             scoring_file = all_scoring_file,
             orf_calling_proteins = orf_calling_proteins,
             orf_caller = orf_calling_program,
+            genetic_code = genetic_code,
+            mikado_genetic_code = mikado_genetic_code,
             mikado_do_homology_assessment = run_mikado_homology,
             homology_proteins = homology_proteins,
             junctions = def_junctions,
@@ -187,6 +191,8 @@ workflow wf_main_mikado {
                 scoring_file = long_scoring_file,
                 orf_calling_proteins = orf_calling_proteins,
                 orf_caller = orf_calling_program,
+                genetic_code = genetic_code,
+                mikado_genetic_code = mikado_genetic_code,
                 mikado_do_homology_assessment = run_mikado_homology,
                 homology_proteins = homology_proteins,
                 junctions = def_junctions,
@@ -218,6 +224,8 @@ workflow wf_main_mikado {
                 junctions = def_junctions,
                 orf_calling_proteins = orf_calling_proteins,
                 orf_caller = orf_calling_program,
+                genetic_code = genetic_code,
+                mikado_genetic_code = mikado_genetic_code,
                 mikado_do_homology_assessment = run_mikado_homology,
                 homology_proteins = homology_proteins,
                 output_prefix = "mikado_longLQ",
@@ -251,6 +259,8 @@ workflow wf_main_mikado {
             junctions = def_junctions,
             orf_calling_proteins = orf_calling_proteins,
             orf_caller = orf_calling_program,
+            genetic_code = genetic_code,
+            mikado_genetic_code = mikado_genetic_code,
             mikado_do_homology_assessment = run_mikado_homology,
             homology_proteins = homology_proteins,
             output_prefix = "mikado_all",
@@ -281,6 +291,8 @@ workflow wf_main_mikado {
                 junctions = def_junctions,
                 orf_calling_proteins = orf_calling_proteins,
                 orf_caller = orf_calling_program,
+                genetic_code = genetic_code,
+                mikado_genetic_code = mikado_genetic_code,
                 mikado_do_homology_assessment = run_mikado_homology,
                 homology_proteins = homology_proteins,
                 output_prefix = "mikado_long",
diff --git a/annotation/transcriptome_module/subworkflows/mikado/orf_caller/wf_prodigal.wdl b/annotation/transcriptome_module/subworkflows/mikado/orf_caller/wf_prodigal.wdl
index a02e24c..36c56c3 100644
--- a/annotation/transcriptome_module/subworkflows/mikado/orf_caller/wf_prodigal.wdl
+++ b/annotation/transcriptome_module/subworkflows/mikado/orf_caller/wf_prodigal.wdl
@@ -9,7 +9,6 @@ workflow wf_prodigal {
         RuntimeAttr? prodigal_runtime_attr
     }
 
-    # TODO: Chunk input, train first then subdivide transcripts for parallel processing
     call Prodigal {
         input:
         prepared_transcripts = prepared_transcripts,
@@ -27,33 +26,6 @@ task Prodigal {
     input {
         File prepared_transcripts
         Int gencode_id
-# gencode_name list (id, names):
-#    1  ['Standard', 'SGC0']
-#    2  ['Vertebrate Mitochondrial', 'SGC1']
-#    3  ['Yeast Mitochondrial', 'SGC2']
-#    4  ['Mold Mitochondrial', 'Protozoan Mitochondrial', 'Coelenterate Mitochondrial', 'Mycoplasma', 'Spiroplasma', 'SGC3']
-#    5  ['Invertebrate Mitochondrial', 'SGC4']
-#    6  ['Ciliate Nuclear', 'Dasycladacean Nuclear', 'Hexamita Nuclear', 'SGC5']
-#    9  ['Echinoderm Mitochondrial', 'Flatworm Mitochondrial', 'SGC8']
-#    10 ['Euplotid Nuclear', 'SGC9']
-#    11 ['Bacterial', 'Archaeal', 'Plant Plastid']
-#    12 ['Alternative Yeast Nuclear']
-#    13 ['Ascidian Mitochondrial']
-#    14 ['Alternative Flatworm Mitochondrial']
-#    15 ['Blepharisma Macronuclear']
-#    16 ['Chlorophycean Mitochondrial']
-#    21 ['Trematode Mitochondrial']
-#    22 ['Scenedesmus obliquus Mitochondrial']
-#    23 ['Thraustochytrium Mitochondrial']
-#    24 ['Pterobranchia Mitochondrial']
-#    25 ['Candidate Division SR1', 'Gracilibacteria']
-#    26 ['Pachysolen tannophilus Nuclear']
-#    27 ['Karyorelict Nuclear']
-#    28 ['Condylostoma Nuclear']
-#    29 ['Mesodinium Nuclear']
-#    30 ['Peritrich Nuclear']
-#    31 ['Blastocrithidia Nuclear']
-#    32 ['Balanophoraceae Plastid']
 
         String output_directory
         RuntimeAttr? runtime_attr_override
diff --git a/annotation/transcriptome_module/subworkflows/mikado/wf_mikado.wdl b/annotation/transcriptome_module/subworkflows/mikado/wf_mikado.wdl
index 7e2d8a6..7d75a9a 100644
--- a/annotation/transcriptome_module/subworkflows/mikado/wf_mikado.wdl
+++ b/annotation/transcriptome_module/subworkflows/mikado/wf_mikado.wdl
@@ -15,6 +15,8 @@ workflow wf_mikado {
         File? annotation
         Int annotation_score
         String mode
+        String genetic_code
+        Int mikado_genetic_code
         Boolean check_reference
 
         File scoring_file
@@ -49,6 +51,7 @@ workflow wf_mikado {
         reference_fasta = indexed_reference.fasta,
         blast_targets = homology_proteins,
         mode = mode,
+        mikado_genetic_code = mikado_genetic_code,
         jsamples = jsamples,
         annotation = annotation,
         annotation_score = annotation_score,
@@ -71,7 +74,7 @@ workflow wf_mikado {
         if (def_orf_caller == "prodigal") {
             call pdg.wf_prodigal {
                 input:
-                gencode = prodigal_gencode,
+                gencode = genetic_code,
                 prepared_transcripts = select_first([FilterPrepare.filtered_prepared_fasta, MikadoPrepare.prepared_fasta]),
                 output_directory = output_prefix,
                 prodigal_runtime_attr = orf_calling_resources
@@ -84,7 +87,7 @@ workflow wf_mikado {
                 prepared_transcripts = select_first([FilterPrepare.filtered_prepared_fasta, MikadoPrepare.prepared_fasta]),
                 orf_proteins = orf_calling_proteins,
                 orf_calling_resources = orf_calling_resources,
-                genetic_code = transdecoder_genetic_code,
+                genetic_code = genetic_code,
                 index_resources = orf_protein_index_resources,
                 output_prefix = output_prefix,
                 orf_alignment_resources = orf_protein_alignment_resources,
@@ -340,6 +343,7 @@ task MikadoPrepare {
     input {
         File reference_fasta
         String mode
+        Int mikado_genetic_code
         Boolean check_reference
 
         Array[Array[AssembledSample]] jsamples
@@ -400,6 +404,7 @@ task MikadoPrepare {
         mikado configure ${mode_parameter} ~{if(check_reference) then "--check-references" else ""} \
         ~{"-bt " + blast_targets} \
         --list=model.list \
+        --codon-table=~{mikado_genetic_code} \
         ~{"--reference=" + reference_fasta} \
         ~{output_prefix}-mikado.yaml