From 77ab3a420af60063568e7e533a8d62c83f05bbb9 Mon Sep 17 00:00:00 2001 From: ljyanesm Date: Sat, 4 Sep 2021 06:55:00 +0100 Subject: [PATCH] Accepts different genetic codes --- annotation/__main__.py | 32 ++++++----- annotation/transcriptome.py | 56 +++++++++++++++++-- annotation/transcriptome_module/main.wdl | 4 ++ annotation/transcriptome_module/mikado.wdl | 12 ++++ .../mikado/orf_caller/wf_prodigal.wdl | 28 ---------- .../subworkflows/mikado/wf_mikado.wdl | 9 ++- 6 files changed, 92 insertions(+), 49 deletions(-) diff --git a/annotation/__main__.py b/annotation/__main__.py index 9672f80..a9c4d5b 100644 --- a/annotation/__main__.py +++ b/annotation/__main__.py @@ -38,7 +38,8 @@ from annotation import VERSION from annotation.homology import combine_arguments_homology, validate_homology_inputs -from annotation.transcriptome import combine_arguments, validate_transcriptome_inputs +from annotation.transcriptome import combine_arguments, validate_transcriptome_inputs, transcriptome_cli_validation, \ + genetic_code_str_to_int LONG_READ_ALIGNER_CHOICES = ['minimap2', 'gmap', '2pass', '2pass_merged'] RUN_METADATA = "run_details.json" @@ -225,6 +226,12 @@ def parse_arguments(): help="Base parameters file, this file can be the output of a previous REAT run " "which will be used as the base for a new parameters file written to the" " output_parameters_file argument") + transcriptome_ap.add_argument("--genetic_code", + help=f"Parameter for the translation table used in Mikado for translating CDS " + f"sequences, and for ORF calling, can take values in the genetic code range of " + f"NCBI as an integer. E.g 1, 6, 10 " + f"or when using TransDecoder as ORF caller, one of: " + f"{', '.join(genetic_code_str_to_int.keys())}", default='1') # Mikado arguments mikado_parameters = transcriptome_ap.add_argument_group("Mikado", "Parameters for Mikado runs") @@ -440,18 +447,11 @@ def parse_arguments(): args = reat_ap.parse_args() + genetic_code = 0 + mikado_genetic_code = 0 if args.reat_module == 'transcriptome': - if args.separate_mikado_LQ: - if not args.long_lq_scoring_file: - reat_ap.error("When '--separate_mikado_LQ' is enabled, --long_lq_scoring_file is required, please " - "provide it.") - - if args.samples and (args.csv_paired_samples or args.csv_long_samples): - reat_ap.error("Conflicting arguments '--samples' and ['--csv_paired_samples' or '--csv_long_samples'] " - "provided, please choose one of csv or json sample input format") - if not args.samples and not args.csv_paired_samples and not args.csv_long_samples: - reat_ap.error("Please provide at least one of --samples, --csv_paired_samples, --csv_long_samples") - return args + genetic_code, mikado_genetic_code = transcriptome_cli_validation(args, reat_ap) + return args, genetic_code, mikado_genetic_code def symlink(path, out_file): @@ -642,7 +642,7 @@ def link_bams(outputs, outputs_path, bams_array, stats_array, stats_table): symlink(alignments_path, outputs[stats_table]) -def transcriptome_module(cli_arguments): +def transcriptome_module(cli_arguments, genetic_code='1', mikado_genetic_code=1): """ Collects the CLI arguments and combines them with CLI defined input files. The resulting object is validated and the final inputs are written to a json Cromwell input file. @@ -651,6 +651,8 @@ def transcriptome_module(cli_arguments): """ # Print input file for cromwell cromwell_inputs = combine_arguments(cli_arguments) + cromwell_inputs['ei_annotation.mikado_genetic_code'] = mikado_genetic_code + cromwell_inputs['ei_annotation.genetic_code'] = str(genetic_code) cromwell_jar, runtime_config = prepare_cromwell_arguments(cli_arguments) # Validate input against schema @@ -880,7 +882,7 @@ def main(): print("\n") start_time = time.time() - cli_arguments = parse_arguments() + cli_arguments, genetic_code, mikado_genetic_code = parse_arguments() check_environment() # Check options.json @@ -893,7 +895,7 @@ def main(): print('... ' if start else '', snippet, ' ...' if stop < len(err.doc) else '', sep="") if cli_arguments.reat_module == "transcriptome": - rc = transcriptome_module(cli_arguments) + rc = transcriptome_module(cli_arguments, genetic_code, mikado_genetic_code) elif cli_arguments.reat_module == "homology": rc = homology_module(cli_arguments) else: diff --git a/annotation/transcriptome.py b/annotation/transcriptome.py index 8b71aba..e8400fc 100644 --- a/annotation/transcriptome.py +++ b/annotation/transcriptome.py @@ -12,6 +12,20 @@ from annotation import report_errors +UNSUPPORTED_GENETIC_CODES_INT = [2, 3, 4, 5, 9, 11, 13, 14, 16, 21, 22, 23, 24] + +genetic_code_str_to_int = {'Universal': 1, + 'Tetrahymena': 6, + 'Acetabularia': 6, + 'Ciliate': 6, + 'Dasycladacean': 6, + 'Hexamita': 6, + 'Candida': 1, + 'Euplotid': 10, + 'SR1_Gracilibacteria': 25, + 'Pachysolen_tannophilus': 1, + 'Peritrich': 6} + try: from yaml import CLoader as Loader, CDumper as Dumper, load, dump except ImportError: @@ -49,15 +63,15 @@ def separate_mikado_config(mikado_config, mikado_run): if any((prepare, serialise, pick)): Path(mikado_run).mkdir(exist_ok=True) if prepare != previous_prepare: - prepare_path = Path(mikado_run).joinpath(ctimestamp+"-prepare.yaml") + prepare_path = Path(mikado_run).joinpath(ctimestamp + "-prepare.yaml") print(dump({'prepare': prepare}, default_flow_style=False), file=open(prepare_path, 'w')) if serialise != previous_serialise: - serialise_path = Path(mikado_run).joinpath(ctimestamp+"-serialise.yaml") + serialise_path = Path(mikado_run).joinpath(ctimestamp + "-serialise.yaml") print(dump({'serialise': serialise}, default_flow_style=False), file=open(serialise_path, 'w')) if pick != previous_pick: - pick_path = Path(mikado_run).joinpath(ctimestamp+"-pick.yaml") + pick_path = Path(mikado_run).joinpath(ctimestamp + "-pick.yaml") print(dump({'pick': pick}, default_flow_style=False), file=open(pick_path, 'w')) @@ -218,7 +232,8 @@ def validate_paired_samples(samples): if name in names: errors[line].append(("Non-unique label specified: '{}'".format(name))) if strand.lower() not in strands: - errors[line].append(("Incorrect strand '{}' specification, please choose one of {}".format(strand, strands))) + errors[line].append( + ("Incorrect strand '{}' specification, please choose one of {}".format(strand, strands))) merge = merge.lower() if merge in ("true", "false"): @@ -464,3 +479,36 @@ def validate_transcriptome_inputs(cromwell_inputs): all_validators["is_name"] = is_valid_name reat_validator = validators.create(meta_schema=reat_schema, validators=all_validators) reat_validator(reat_schema).validate(cromwell_inputs) + + +def transcriptome_cli_validation(args, reat_ap): + if args.separate_mikado_LQ: + if not args.long_lq_scoring_file: + reat_ap.error("When '--separate_mikado_LQ' is enabled, --long_lq_scoring_file is required, please " + "provide it.") + if args.samples and (args.csv_paired_samples or args.csv_long_samples): + reat_ap.error("Conflicting arguments '--samples' and ['--csv_paired_samples' or '--csv_long_samples'] " + "provided, please choose one of csv or json sample input format") + if not args.samples and not args.csv_paired_samples and not args.csv_long_samples: + reat_ap.error("Please provide at least one of --samples, --csv_paired_samples, --csv_long_samples") + + if args.genetic_code.isdigit(): + genetic_code = int(args.genetic_code) + if genetic_code in UNSUPPORTED_GENETIC_CODES_INT or genetic_code > 25: + reat_ap.error( + f"Sorry, genetic_code={args.genetic_code} is not supported, please use one of " + f"{set(range(0, 25)) - set(UNSUPPORTED_GENETIC_CODES_INT)}") + if args.orf_caller == 'transdecoder': + reat_ap.error(f"Please use one of {', '.join(genetic_code_str_to_int.keys())}, as these define " + f"more specific genetic codes when using TransDecoder as the ORF caller") + else: + genetic_code = 0 + try: + genetic_code = genetic_code_str_to_int[args.genetic_code] + except KeyError: + reat_ap.error( + f"Sorry, genetic_code={args.genetic_code} is not supported, please use one of " + f"{', '.join(genetic_code_str_to_int.keys())}") + mikado_genetic_code = genetic_code + + return genetic_code, mikado_genetic_code \ No newline at end of file diff --git a/annotation/transcriptome_module/main.wdl b/annotation/transcriptome_module/main.wdl index 22b147a..edf3544 100644 --- a/annotation/transcriptome_module/main.wdl +++ b/annotation/transcriptome_module/main.wdl @@ -11,6 +11,8 @@ workflow ei_annotation { Array[LRSample]? HQ_long_read_samples File? annotation Int annotation_score = 1 + String genetic_code + Int mikado_genetic_code Boolean merge_all_junctions = false String mode Boolean check_reference @@ -72,6 +74,8 @@ workflow ei_annotation { LQ_assemblies = wf_align.LQ_gff, HQ_assemblies = wf_align.HQ_gff, orf_calling_proteins = orf_calling_proteins, + genetic_code = genetic_code, + mikado_genetic_code = mikado_genetic_code, homology_proteins = homology_proteins, annotation_score = annotation_score, mode = mode, diff --git a/annotation/transcriptome_module/mikado.wdl b/annotation/transcriptome_module/mikado.wdl index 8c27ffa..8c2cbc1 100644 --- a/annotation/transcriptome_module/mikado.wdl +++ b/annotation/transcriptome_module/mikado.wdl @@ -11,6 +11,8 @@ workflow wf_main_mikado { Int annotation_score = 1 String mode + String genetic_code + Int mikado_genetic_code Boolean check_reference File all_scoring_file @@ -158,6 +160,8 @@ workflow wf_main_mikado { scoring_file = all_scoring_file, orf_calling_proteins = orf_calling_proteins, orf_caller = orf_calling_program, + genetic_code = genetic_code, + mikado_genetic_code = mikado_genetic_code, mikado_do_homology_assessment = run_mikado_homology, homology_proteins = homology_proteins, junctions = def_junctions, @@ -187,6 +191,8 @@ workflow wf_main_mikado { scoring_file = long_scoring_file, orf_calling_proteins = orf_calling_proteins, orf_caller = orf_calling_program, + genetic_code = genetic_code, + mikado_genetic_code = mikado_genetic_code, mikado_do_homology_assessment = run_mikado_homology, homology_proteins = homology_proteins, junctions = def_junctions, @@ -218,6 +224,8 @@ workflow wf_main_mikado { junctions = def_junctions, orf_calling_proteins = orf_calling_proteins, orf_caller = orf_calling_program, + genetic_code = genetic_code, + mikado_genetic_code = mikado_genetic_code, mikado_do_homology_assessment = run_mikado_homology, homology_proteins = homology_proteins, output_prefix = "mikado_longLQ", @@ -251,6 +259,8 @@ workflow wf_main_mikado { junctions = def_junctions, orf_calling_proteins = orf_calling_proteins, orf_caller = orf_calling_program, + genetic_code = genetic_code, + mikado_genetic_code = mikado_genetic_code, mikado_do_homology_assessment = run_mikado_homology, homology_proteins = homology_proteins, output_prefix = "mikado_all", @@ -281,6 +291,8 @@ workflow wf_main_mikado { junctions = def_junctions, orf_calling_proteins = orf_calling_proteins, orf_caller = orf_calling_program, + genetic_code = genetic_code, + mikado_genetic_code = mikado_genetic_code, mikado_do_homology_assessment = run_mikado_homology, homology_proteins = homology_proteins, output_prefix = "mikado_long", diff --git a/annotation/transcriptome_module/subworkflows/mikado/orf_caller/wf_prodigal.wdl b/annotation/transcriptome_module/subworkflows/mikado/orf_caller/wf_prodigal.wdl index a02e24c..36c56c3 100644 --- a/annotation/transcriptome_module/subworkflows/mikado/orf_caller/wf_prodigal.wdl +++ b/annotation/transcriptome_module/subworkflows/mikado/orf_caller/wf_prodigal.wdl @@ -9,7 +9,6 @@ workflow wf_prodigal { RuntimeAttr? prodigal_runtime_attr } - # TODO: Chunk input, train first then subdivide transcripts for parallel processing call Prodigal { input: prepared_transcripts = prepared_transcripts, @@ -27,33 +26,6 @@ task Prodigal { input { File prepared_transcripts Int gencode_id -# gencode_name list (id, names): -# 1 ['Standard', 'SGC0'] -# 2 ['Vertebrate Mitochondrial', 'SGC1'] -# 3 ['Yeast Mitochondrial', 'SGC2'] -# 4 ['Mold Mitochondrial', 'Protozoan Mitochondrial', 'Coelenterate Mitochondrial', 'Mycoplasma', 'Spiroplasma', 'SGC3'] -# 5 ['Invertebrate Mitochondrial', 'SGC4'] -# 6 ['Ciliate Nuclear', 'Dasycladacean Nuclear', 'Hexamita Nuclear', 'SGC5'] -# 9 ['Echinoderm Mitochondrial', 'Flatworm Mitochondrial', 'SGC8'] -# 10 ['Euplotid Nuclear', 'SGC9'] -# 11 ['Bacterial', 'Archaeal', 'Plant Plastid'] -# 12 ['Alternative Yeast Nuclear'] -# 13 ['Ascidian Mitochondrial'] -# 14 ['Alternative Flatworm Mitochondrial'] -# 15 ['Blepharisma Macronuclear'] -# 16 ['Chlorophycean Mitochondrial'] -# 21 ['Trematode Mitochondrial'] -# 22 ['Scenedesmus obliquus Mitochondrial'] -# 23 ['Thraustochytrium Mitochondrial'] -# 24 ['Pterobranchia Mitochondrial'] -# 25 ['Candidate Division SR1', 'Gracilibacteria'] -# 26 ['Pachysolen tannophilus Nuclear'] -# 27 ['Karyorelict Nuclear'] -# 28 ['Condylostoma Nuclear'] -# 29 ['Mesodinium Nuclear'] -# 30 ['Peritrich Nuclear'] -# 31 ['Blastocrithidia Nuclear'] -# 32 ['Balanophoraceae Plastid'] String output_directory RuntimeAttr? runtime_attr_override diff --git a/annotation/transcriptome_module/subworkflows/mikado/wf_mikado.wdl b/annotation/transcriptome_module/subworkflows/mikado/wf_mikado.wdl index 7e2d8a6..7d75a9a 100644 --- a/annotation/transcriptome_module/subworkflows/mikado/wf_mikado.wdl +++ b/annotation/transcriptome_module/subworkflows/mikado/wf_mikado.wdl @@ -15,6 +15,8 @@ workflow wf_mikado { File? annotation Int annotation_score String mode + String genetic_code + Int mikado_genetic_code Boolean check_reference File scoring_file @@ -49,6 +51,7 @@ workflow wf_mikado { reference_fasta = indexed_reference.fasta, blast_targets = homology_proteins, mode = mode, + mikado_genetic_code = mikado_genetic_code, jsamples = jsamples, annotation = annotation, annotation_score = annotation_score, @@ -71,7 +74,7 @@ workflow wf_mikado { if (def_orf_caller == "prodigal") { call pdg.wf_prodigal { input: - gencode = prodigal_gencode, + gencode = genetic_code, prepared_transcripts = select_first([FilterPrepare.filtered_prepared_fasta, MikadoPrepare.prepared_fasta]), output_directory = output_prefix, prodigal_runtime_attr = orf_calling_resources @@ -84,7 +87,7 @@ workflow wf_mikado { prepared_transcripts = select_first([FilterPrepare.filtered_prepared_fasta, MikadoPrepare.prepared_fasta]), orf_proteins = orf_calling_proteins, orf_calling_resources = orf_calling_resources, - genetic_code = transdecoder_genetic_code, + genetic_code = genetic_code, index_resources = orf_protein_index_resources, output_prefix = output_prefix, orf_alignment_resources = orf_protein_alignment_resources, @@ -340,6 +343,7 @@ task MikadoPrepare { input { File reference_fasta String mode + Int mikado_genetic_code Boolean check_reference Array[Array[AssembledSample]] jsamples @@ -400,6 +404,7 @@ task MikadoPrepare { mikado configure ${mode_parameter} ~{if(check_reference) then "--check-references" else ""} \ ~{"-bt " + blast_targets} \ --list=model.list \ + --codon-table=~{mikado_genetic_code} \ ~{"--reference=" + reference_fasta} \ ~{output_prefix}-mikado.yaml