From 8ea266b37dad2cfbedf149eca8e077947e6b5971 Mon Sep 17 00:00:00 2001 From: Sam Horsfield Date: Wed, 9 Oct 2024 11:51:45 +0100 Subject: [PATCH] Completes removal of high scoring ORF dict from panaroo runner --- ggCaller/__main__.py | 15 ++++++++------- ggCaller/graph_traversal.py | 1 + panaroo_runner/__main__.py | 30 ++++++------------------------ panaroo_runner/generate_output.py | 10 ++++++++++ 4 files changed, 25 insertions(+), 31 deletions(-) diff --git a/ggCaller/__main__.py b/ggCaller/__main__.py index 4d4d3ef..f475de2 100644 --- a/ggCaller/__main__.py +++ b/ggCaller/__main__.py @@ -530,7 +530,7 @@ def main(): options.truncation_threshold, options.save, options.refind) else: - print_ORF_calls(high_scoring_ORFs, os.path.join(output_dir, "gene_calls"), + print_ORF_calls(ORF_file_paths, os.path.join(output_dir, "gene_calls"), input_colours, overlap, graph) if options.save: @@ -544,16 +544,17 @@ def main(): # serialise graph object and high scoring ORFs to future reading graph[0].data_out(objects_dir + "ggc_graph.dat") - with open(objects_dir + "high_scoring_orfs.dat", "wb") as o: - cPickle.dump(high_scoring_ORFs, o) # create index of all high_scoring_ORFs node_IDs node_index = defaultdict(list) - for colour, gene_dict in high_scoring_ORFs.items(): - for ORF_ID, ORF_info in gene_dict.items(): - entry_ID = str(colour) + "_" + str(ORF_ID) + for colour_ID, file_path in ORF_file_paths.items(): + ORF_map = ggCaller_cpp.read_ORF_file(file_path) + + for ORF_ID, ORF_info in ORF_map.items(): + delim = "_0_" if ORF_ID > 0 else "_refound_" + entry_ID = str(colour_ID) + delim + str(ORF_ID) for node in ORF_info[0]: - node_index[node].append(entry_ID) + node_index[abs(node)].append(entry_ID) with open(objects_dir + "node_index.dat", "wb") as o: cPickle.dump(node_index, o) diff --git a/ggCaller/graph_traversal.py b/ggCaller/graph_traversal.py index 67f3e7e..1b1bdf7 100644 --- a/ggCaller/graph_traversal.py +++ b/ggCaller/graph_traversal.py @@ -2,6 +2,7 @@ import _pickle as cPickle from Bio import SeqIO +# TODO update this to work with ggCaller ORF map outputs and panaroo graph which contains annotation def search_graph(graph, graphfile, coloursfile, queryfile, objects_dir, output_dir, query_id, num_threads): # check if objects_dir present, if not exit if not os.path.exists(objects_dir): diff --git a/panaroo_runner/__main__.py b/panaroo_runner/__main__.py index 06de1af..f9f3b8a 100644 --- a/panaroo_runner/__main__.py +++ b/panaroo_runner/__main__.py @@ -146,7 +146,7 @@ def run_panaroo(pool, shd_arr_tup, ORF_file_paths, Edge_file_paths, cluster_file G = find_missing(G, ORF_file_paths, shd_arr_tup, - kmer=kmer,ƒ + kmer=kmer, repeat=repeat, overlap=overlap, isolate_names=input_colours, @@ -339,13 +339,6 @@ def run_panaroo(pool, shd_arr_tup, ORF_file_paths, Edge_file_paths, cluster_file G.nodes[node]['dna'] = ";".join(conv_list(G.nodes[node]['dna'])) G.nodes[node]['protein'] = ";".join(conv_list(G.nodes[node]['protein'])) - # TODO sort out saving object, don't necessarily need to anymore as they are saved by default - # # add node annotation - # if save_objects: - # high_scoring_ORFs[genome_id][local_id] = list(high_scoring_ORFs[genome_id][local_id]) - # high_scoring_ORFs[genome_id][local_id][-1] = G.nodes[node]["description"] - - for edge in G.edges(): G.edges[edge[0], edge[1]]['genomeIDs'] = ";".join( [str(m) for m in G.edges[edge[0], edge[1]]['members']]) @@ -366,34 +359,23 @@ def run_panaroo(pool, shd_arr_tup, ORF_file_paths, Edge_file_paths, cluster_file # make sure trailing forward slash is present objects_dir = os.path.join(objects_dir, "") - to_remove = defaultdict(set) - - # create index of all high_scoring_ORFs node_IDs, remove if not in panaroo graph + # create index of all high_scoring_ORFs node_IDs node_index = defaultdict(list) for colour_ID, file_path in ORF_file_paths.items(): ORF_map = ggCaller_cpp.read_ORF_file(file_path) for ORF_ID, ORF_info in ORF_map.items(): - if not isinstance(ORF_info, list): - to_remove[colour_ID].add(ORF_ID) - continue delim = "_0_" if ORF_ID > 0 else "_refound_" entry_ID = str(colour_ID) + delim + str(ORF_ID) for node in ORF_info[0]: node_index[abs(node)].append(entry_ID) - # remove genes not in panaroo graph - for colour_ID, file_path in ORF_file_paths.items(): - ORF_map = ggCaller_cpp.read_ORF_file(file_path) - - for ORF_ID in to_remove[colour_ID]: - del ORF_map[ORF_ID] - - ggCaller_cpp.save_ORF_file(file_path, ORF_map) - with open(objects_dir + "node_index.dat", "wb") as o: cPickle.dump(node_index, o) - + + # map each ORF to node in graph + with open(objects_dir + "ORF_to_node_map.dat", "wb") as o: + cPickle.dump(ids_len_stop, o) return diff --git a/panaroo_runner/generate_output.py b/panaroo_runner/generate_output.py index 50c96df..57f4139 100644 --- a/panaroo_runner/generate_output.py +++ b/panaroo_runner/generate_output.py @@ -273,10 +273,15 @@ def print_ORF_calls(ORF_file_paths, outfile, input_colours, overlap, DBG, trunca description="")) else: for colour, file_path in ORF_file_paths.items(): + to_remove = set() ORF_map = ggCaller_cpp.read_ORF_file(file_path) for ORF_ID, ORF_info in ORF_map.items(): pan_ORF_id = str(colour) + "_0_" + str(ORF_ID) + # if pan_ORF_id not in ORF_map means has been removed so remove it from ORF_map + if pan_ORF_id not in ids_len_stop: + to_remove.add(ORF_ID) + continue node = ids_len_stop[pan_ORF_id][2] node_annotation = G.nodes[node]["description"] length_centroid = G.nodes[node]['lengths'][G.nodes[node]['maxLenId']] @@ -299,6 +304,11 @@ def print_ORF_calls(ORF_file_paths, outfile, input_colours, overlap, DBG, trunca aa_records.append( SeqRecord(Seq(gene).translate(), id=isolate_names[colour] + "_" + str(ORF_ID), description="ggcID=" + pan_ORF_id + ";" + gene_annotation)) + + # remove from ORF_map and save + for ORF_ID in to_remove: + del ORF_map[ORF_ID] + ggCaller_cpp.save_ORF_file(file_path, ORF_map) SeqIO.write(DNA_records, outfile + ".ffn", "fasta") SeqIO.write(aa_records, outfile + ".faa", "fasta") return