Skip to content

Commit

Permalink
Accepts different genetic codes
Browse files Browse the repository at this point in the history
  • Loading branch information
ljyanesm committed Sep 4, 2021
1 parent 1b55933 commit 77ab3a4
Show file tree
Hide file tree
Showing 6 changed files with 92 additions and 49 deletions.
32 changes: 17 additions & 15 deletions annotation/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@

from annotation import VERSION
from annotation.homology import combine_arguments_homology, validate_homology_inputs
from annotation.transcriptome import combine_arguments, validate_transcriptome_inputs
from annotation.transcriptome import combine_arguments, validate_transcriptome_inputs, transcriptome_cli_validation, \
genetic_code_str_to_int

LONG_READ_ALIGNER_CHOICES = ['minimap2', 'gmap', '2pass', '2pass_merged']
RUN_METADATA = "run_details.json"
Expand Down Expand Up @@ -225,6 +226,12 @@ def parse_arguments():
help="Base parameters file, this file can be the output of a previous REAT run "
"which will be used as the base for a new parameters file written to the"
" output_parameters_file argument")
transcriptome_ap.add_argument("--genetic_code",
help=f"Parameter for the translation table used in Mikado for translating CDS "
f"sequences, and for ORF calling, can take values in the genetic code range of "
f"NCBI as an integer. E.g 1, 6, 10 "
f"or when using TransDecoder as ORF caller, one of: "
f"{', '.join(genetic_code_str_to_int.keys())}", default='1')

# Mikado arguments
mikado_parameters = transcriptome_ap.add_argument_group("Mikado", "Parameters for Mikado runs")
Expand Down Expand Up @@ -440,18 +447,11 @@ def parse_arguments():

args = reat_ap.parse_args()

genetic_code = 0
mikado_genetic_code = 0
if args.reat_module == 'transcriptome':
if args.separate_mikado_LQ:
if not args.long_lq_scoring_file:
reat_ap.error("When '--separate_mikado_LQ' is enabled, --long_lq_scoring_file is required, please "
"provide it.")

if args.samples and (args.csv_paired_samples or args.csv_long_samples):
reat_ap.error("Conflicting arguments '--samples' and ['--csv_paired_samples' or '--csv_long_samples'] "
"provided, please choose one of csv or json sample input format")
if not args.samples and not args.csv_paired_samples and not args.csv_long_samples:
reat_ap.error("Please provide at least one of --samples, --csv_paired_samples, --csv_long_samples")
return args
genetic_code, mikado_genetic_code = transcriptome_cli_validation(args, reat_ap)
return args, genetic_code, mikado_genetic_code


def symlink(path, out_file):
Expand Down Expand Up @@ -642,7 +642,7 @@ def link_bams(outputs, outputs_path, bams_array, stats_array, stats_table):
symlink(alignments_path, outputs[stats_table])


def transcriptome_module(cli_arguments):
def transcriptome_module(cli_arguments, genetic_code='1', mikado_genetic_code=1):
"""
Collects the CLI arguments and combines them with CLI defined input files.
The resulting object is validated and the final inputs are written to a json Cromwell input file.
Expand All @@ -651,6 +651,8 @@ def transcriptome_module(cli_arguments):
"""
# Print input file for cromwell
cromwell_inputs = combine_arguments(cli_arguments)
cromwell_inputs['ei_annotation.mikado_genetic_code'] = mikado_genetic_code
cromwell_inputs['ei_annotation.genetic_code'] = str(genetic_code)
cromwell_jar, runtime_config = prepare_cromwell_arguments(cli_arguments)

# Validate input against schema
Expand Down Expand Up @@ -880,7 +882,7 @@ def main():
print("\n")

start_time = time.time()
cli_arguments = parse_arguments()
cli_arguments, genetic_code, mikado_genetic_code = parse_arguments()
check_environment()

# Check options.json
Expand All @@ -893,7 +895,7 @@ def main():
print('... ' if start else '', snippet, ' ...' if stop < len(err.doc) else '', sep="")

if cli_arguments.reat_module == "transcriptome":
rc = transcriptome_module(cli_arguments)
rc = transcriptome_module(cli_arguments, genetic_code, mikado_genetic_code)
elif cli_arguments.reat_module == "homology":
rc = homology_module(cli_arguments)
else:
Expand Down
56 changes: 52 additions & 4 deletions annotation/transcriptome.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,20 @@

from annotation import report_errors

UNSUPPORTED_GENETIC_CODES_INT = [2, 3, 4, 5, 9, 11, 13, 14, 16, 21, 22, 23, 24]

genetic_code_str_to_int = {'Universal': 1,
'Tetrahymena': 6,
'Acetabularia': 6,
'Ciliate': 6,
'Dasycladacean': 6,
'Hexamita': 6,
'Candida': 1,
'Euplotid': 10,
'SR1_Gracilibacteria': 25,
'Pachysolen_tannophilus': 1,
'Peritrich': 6}

try:
from yaml import CLoader as Loader, CDumper as Dumper, load, dump
except ImportError:
Expand Down Expand Up @@ -49,15 +63,15 @@ def separate_mikado_config(mikado_config, mikado_run):
if any((prepare, serialise, pick)):
Path(mikado_run).mkdir(exist_ok=True)
if prepare != previous_prepare:
prepare_path = Path(mikado_run).joinpath(ctimestamp+"-prepare.yaml")
prepare_path = Path(mikado_run).joinpath(ctimestamp + "-prepare.yaml")
print(dump({'prepare': prepare}, default_flow_style=False),
file=open(prepare_path, 'w'))
if serialise != previous_serialise:
serialise_path = Path(mikado_run).joinpath(ctimestamp+"-serialise.yaml")
serialise_path = Path(mikado_run).joinpath(ctimestamp + "-serialise.yaml")
print(dump({'serialise': serialise}, default_flow_style=False),
file=open(serialise_path, 'w'))
if pick != previous_pick:
pick_path = Path(mikado_run).joinpath(ctimestamp+"-pick.yaml")
pick_path = Path(mikado_run).joinpath(ctimestamp + "-pick.yaml")
print(dump({'pick': pick}, default_flow_style=False),
file=open(pick_path, 'w'))

Expand Down Expand Up @@ -218,7 +232,8 @@ def validate_paired_samples(samples):
if name in names:
errors[line].append(("Non-unique label specified: '{}'".format(name)))
if strand.lower() not in strands:
errors[line].append(("Incorrect strand '{}' specification, please choose one of {}".format(strand, strands)))
errors[line].append(
("Incorrect strand '{}' specification, please choose one of {}".format(strand, strands)))

merge = merge.lower()
if merge in ("true", "false"):
Expand Down Expand Up @@ -464,3 +479,36 @@ def validate_transcriptome_inputs(cromwell_inputs):
all_validators["is_name"] = is_valid_name
reat_validator = validators.create(meta_schema=reat_schema, validators=all_validators)
reat_validator(reat_schema).validate(cromwell_inputs)


def transcriptome_cli_validation(args, reat_ap):
if args.separate_mikado_LQ:
if not args.long_lq_scoring_file:
reat_ap.error("When '--separate_mikado_LQ' is enabled, --long_lq_scoring_file is required, please "
"provide it.")
if args.samples and (args.csv_paired_samples or args.csv_long_samples):
reat_ap.error("Conflicting arguments '--samples' and ['--csv_paired_samples' or '--csv_long_samples'] "
"provided, please choose one of csv or json sample input format")
if not args.samples and not args.csv_paired_samples and not args.csv_long_samples:
reat_ap.error("Please provide at least one of --samples, --csv_paired_samples, --csv_long_samples")

if args.genetic_code.isdigit():
genetic_code = int(args.genetic_code)
if genetic_code in UNSUPPORTED_GENETIC_CODES_INT or genetic_code > 25:
reat_ap.error(
f"Sorry, genetic_code={args.genetic_code} is not supported, please use one of "
f"{set(range(0, 25)) - set(UNSUPPORTED_GENETIC_CODES_INT)}")
if args.orf_caller == 'transdecoder':
reat_ap.error(f"Please use one of {', '.join(genetic_code_str_to_int.keys())}, as these define "
f"more specific genetic codes when using TransDecoder as the ORF caller")
else:
genetic_code = 0
try:
genetic_code = genetic_code_str_to_int[args.genetic_code]
except KeyError:
reat_ap.error(
f"Sorry, genetic_code={args.genetic_code} is not supported, please use one of "
f"{', '.join(genetic_code_str_to_int.keys())}")
mikado_genetic_code = genetic_code

return genetic_code, mikado_genetic_code
4 changes: 4 additions & 0 deletions annotation/transcriptome_module/main.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ workflow ei_annotation {
Array[LRSample]? HQ_long_read_samples
File? annotation
Int annotation_score = 1
String genetic_code
Int mikado_genetic_code
Boolean merge_all_junctions = false
String mode
Boolean check_reference
Expand Down Expand Up @@ -72,6 +74,8 @@ workflow ei_annotation {
LQ_assemblies = wf_align.LQ_gff,
HQ_assemblies = wf_align.HQ_gff,
orf_calling_proteins = orf_calling_proteins,
genetic_code = genetic_code,
mikado_genetic_code = mikado_genetic_code,
homology_proteins = homology_proteins,
annotation_score = annotation_score,
mode = mode,
Expand Down
12 changes: 12 additions & 0 deletions annotation/transcriptome_module/mikado.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ workflow wf_main_mikado {

Int annotation_score = 1
String mode
String genetic_code
Int mikado_genetic_code
Boolean check_reference

File all_scoring_file
Expand Down Expand Up @@ -158,6 +160,8 @@ workflow wf_main_mikado {
scoring_file = all_scoring_file,
orf_calling_proteins = orf_calling_proteins,
orf_caller = orf_calling_program,
genetic_code = genetic_code,
mikado_genetic_code = mikado_genetic_code,
mikado_do_homology_assessment = run_mikado_homology,
homology_proteins = homology_proteins,
junctions = def_junctions,
Expand Down Expand Up @@ -187,6 +191,8 @@ workflow wf_main_mikado {
scoring_file = long_scoring_file,
orf_calling_proteins = orf_calling_proteins,
orf_caller = orf_calling_program,
genetic_code = genetic_code,
mikado_genetic_code = mikado_genetic_code,
mikado_do_homology_assessment = run_mikado_homology,
homology_proteins = homology_proteins,
junctions = def_junctions,
Expand Down Expand Up @@ -218,6 +224,8 @@ workflow wf_main_mikado {
junctions = def_junctions,
orf_calling_proteins = orf_calling_proteins,
orf_caller = orf_calling_program,
genetic_code = genetic_code,
mikado_genetic_code = mikado_genetic_code,
mikado_do_homology_assessment = run_mikado_homology,
homology_proteins = homology_proteins,
output_prefix = "mikado_longLQ",
Expand Down Expand Up @@ -251,6 +259,8 @@ workflow wf_main_mikado {
junctions = def_junctions,
orf_calling_proteins = orf_calling_proteins,
orf_caller = orf_calling_program,
genetic_code = genetic_code,
mikado_genetic_code = mikado_genetic_code,
mikado_do_homology_assessment = run_mikado_homology,
homology_proteins = homology_proteins,
output_prefix = "mikado_all",
Expand Down Expand Up @@ -281,6 +291,8 @@ workflow wf_main_mikado {
junctions = def_junctions,
orf_calling_proteins = orf_calling_proteins,
orf_caller = orf_calling_program,
genetic_code = genetic_code,
mikado_genetic_code = mikado_genetic_code,
mikado_do_homology_assessment = run_mikado_homology,
homology_proteins = homology_proteins,
output_prefix = "mikado_long",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ workflow wf_prodigal {
RuntimeAttr? prodigal_runtime_attr
}

# TODO: Chunk input, train first then subdivide transcripts for parallel processing
call Prodigal {
input:
prepared_transcripts = prepared_transcripts,
Expand All @@ -27,33 +26,6 @@ task Prodigal {
input {
File prepared_transcripts
Int gencode_id
# gencode_name list (id, names):
# 1 ['Standard', 'SGC0']
# 2 ['Vertebrate Mitochondrial', 'SGC1']
# 3 ['Yeast Mitochondrial', 'SGC2']
# 4 ['Mold Mitochondrial', 'Protozoan Mitochondrial', 'Coelenterate Mitochondrial', 'Mycoplasma', 'Spiroplasma', 'SGC3']
# 5 ['Invertebrate Mitochondrial', 'SGC4']
# 6 ['Ciliate Nuclear', 'Dasycladacean Nuclear', 'Hexamita Nuclear', 'SGC5']
# 9 ['Echinoderm Mitochondrial', 'Flatworm Mitochondrial', 'SGC8']
# 10 ['Euplotid Nuclear', 'SGC9']
# 11 ['Bacterial', 'Archaeal', 'Plant Plastid']
# 12 ['Alternative Yeast Nuclear']
# 13 ['Ascidian Mitochondrial']
# 14 ['Alternative Flatworm Mitochondrial']
# 15 ['Blepharisma Macronuclear']
# 16 ['Chlorophycean Mitochondrial']
# 21 ['Trematode Mitochondrial']
# 22 ['Scenedesmus obliquus Mitochondrial']
# 23 ['Thraustochytrium Mitochondrial']
# 24 ['Pterobranchia Mitochondrial']
# 25 ['Candidate Division SR1', 'Gracilibacteria']
# 26 ['Pachysolen tannophilus Nuclear']
# 27 ['Karyorelict Nuclear']
# 28 ['Condylostoma Nuclear']
# 29 ['Mesodinium Nuclear']
# 30 ['Peritrich Nuclear']
# 31 ['Blastocrithidia Nuclear']
# 32 ['Balanophoraceae Plastid']

String output_directory
RuntimeAttr? runtime_attr_override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ workflow wf_mikado {
File? annotation
Int annotation_score
String mode
String genetic_code
Int mikado_genetic_code
Boolean check_reference

File scoring_file
Expand Down Expand Up @@ -49,6 +51,7 @@ workflow wf_mikado {
reference_fasta = indexed_reference.fasta,
blast_targets = homology_proteins,
mode = mode,
mikado_genetic_code = mikado_genetic_code,
jsamples = jsamples,
annotation = annotation,
annotation_score = annotation_score,
Expand All @@ -71,7 +74,7 @@ workflow wf_mikado {
if (def_orf_caller == "prodigal") {
call pdg.wf_prodigal {
input:
gencode = prodigal_gencode,
gencode = genetic_code,
prepared_transcripts = select_first([FilterPrepare.filtered_prepared_fasta, MikadoPrepare.prepared_fasta]),
output_directory = output_prefix,
prodigal_runtime_attr = orf_calling_resources
Expand All @@ -84,7 +87,7 @@ workflow wf_mikado {
prepared_transcripts = select_first([FilterPrepare.filtered_prepared_fasta, MikadoPrepare.prepared_fasta]),
orf_proteins = orf_calling_proteins,
orf_calling_resources = orf_calling_resources,
genetic_code = transdecoder_genetic_code,
genetic_code = genetic_code,
index_resources = orf_protein_index_resources,
output_prefix = output_prefix,
orf_alignment_resources = orf_protein_alignment_resources,
Expand Down Expand Up @@ -340,6 +343,7 @@ task MikadoPrepare {
input {
File reference_fasta
String mode
Int mikado_genetic_code
Boolean check_reference

Array[Array[AssembledSample]] jsamples
Expand Down Expand Up @@ -400,6 +404,7 @@ task MikadoPrepare {
mikado configure ${mode_parameter} ~{if(check_reference) then "--check-references" else ""} \
~{"-bt " + blast_targets} \
--list=model.list \
--codon-table=~{mikado_genetic_code} \
~{"--reference=" + reference_fasta} \
~{output_prefix}-mikado.yaml

Expand Down

0 comments on commit 77ab3a4

Please sign in to comment.