From 8ea266b37dad2cfbedf149eca8e077947e6b5971 Mon Sep 17 00:00:00 2001
From: Sam Horsfield <s.horsfield19@imperial.ac.uk>
Date: Wed, 9 Oct 2024 11:51:45 +0100
Subject: [PATCH] Completes removal of high scoring ORF dict from panaroo
 runner

---
 ggCaller/__main__.py              | 15 ++++++++-------
 ggCaller/graph_traversal.py       |  1 +
 panaroo_runner/__main__.py        | 30 ++++++------------------------
 panaroo_runner/generate_output.py | 10 ++++++++++
 4 files changed, 25 insertions(+), 31 deletions(-)

diff --git a/ggCaller/__main__.py b/ggCaller/__main__.py
index 4d4d3ef..f475de2 100644
--- a/ggCaller/__main__.py
+++ b/ggCaller/__main__.py
@@ -530,7 +530,7 @@ def main():
                             options.truncation_threshold, options.save, options.refind)
 
     else:
-        print_ORF_calls(high_scoring_ORFs, os.path.join(output_dir, "gene_calls"),
+        print_ORF_calls(ORF_file_paths, os.path.join(output_dir, "gene_calls"),
                         input_colours, overlap, graph)
 
         if options.save:
@@ -544,16 +544,17 @@ def main():
 
             # serialise graph object and high scoring ORFs to future reading
             graph[0].data_out(objects_dir + "ggc_graph.dat")
-            with open(objects_dir + "high_scoring_orfs.dat", "wb") as o:
-                cPickle.dump(high_scoring_ORFs, o)
 
             # create index of all high_scoring_ORFs node_IDs
             node_index = defaultdict(list)
-            for colour, gene_dict in high_scoring_ORFs.items():
-                for ORF_ID, ORF_info in gene_dict.items():
-                    entry_ID = str(colour) + "_" + str(ORF_ID)
+            for colour_ID, file_path in ORF_file_paths.items():
+                ORF_map = ggCaller_cpp.read_ORF_file(file_path)
+
+                for ORF_ID, ORF_info in ORF_map.items():
+                    delim = "_0_" if ORF_ID > 0 else "_refound_"
+                    entry_ID = str(colour_ID) + delim + str(ORF_ID)
                     for node in ORF_info[0]:
-                        node_index[node].append(entry_ID)
+                        node_index[abs(node)].append(entry_ID)
 
             with open(objects_dir + "node_index.dat", "wb") as o:
                 cPickle.dump(node_index, o)
diff --git a/ggCaller/graph_traversal.py b/ggCaller/graph_traversal.py
index 67f3e7e..1b1bdf7 100644
--- a/ggCaller/graph_traversal.py
+++ b/ggCaller/graph_traversal.py
@@ -2,6 +2,7 @@
 import _pickle as cPickle
 from Bio import SeqIO
 
+# TODO update this to work with ggCaller ORF map outputs and panaroo graph which contains annotation
 def search_graph(graph, graphfile, coloursfile, queryfile, objects_dir, output_dir, query_id, num_threads):
     # check if objects_dir present, if not exit
     if not os.path.exists(objects_dir):
diff --git a/panaroo_runner/__main__.py b/panaroo_runner/__main__.py
index 06de1af..f9f3b8a 100644
--- a/panaroo_runner/__main__.py
+++ b/panaroo_runner/__main__.py
@@ -146,7 +146,7 @@ def run_panaroo(pool, shd_arr_tup, ORF_file_paths, Edge_file_paths, cluster_file
         G = find_missing(G,
                          ORF_file_paths,
                          shd_arr_tup,
-                         kmer=kmer,ƒ
+                         kmer=kmer,
                          repeat=repeat,
                          overlap=overlap,
                          isolate_names=input_colours,
@@ -339,13 +339,6 @@ def run_panaroo(pool, shd_arr_tup, ORF_file_paths, Edge_file_paths, cluster_file
             G.nodes[node]['dna'] = ";".join(conv_list(G.nodes[node]['dna']))
             G.nodes[node]['protein'] = ";".join(conv_list(G.nodes[node]['protein']))
 
-        # TODO sort out saving object, don't necessarily need to anymore as they are saved by default
-        # # add node annotation
-        # if save_objects:
-        #     high_scoring_ORFs[genome_id][local_id] = list(high_scoring_ORFs[genome_id][local_id])
-        #     high_scoring_ORFs[genome_id][local_id][-1] = G.nodes[node]["description"]
-
-
     for edge in G.edges():
         G.edges[edge[0], edge[1]]['genomeIDs'] = ";".join(
             [str(m) for m in G.edges[edge[0], edge[1]]['members']])
@@ -366,34 +359,23 @@ def run_panaroo(pool, shd_arr_tup, ORF_file_paths, Edge_file_paths, cluster_file
         # make sure trailing forward slash is present
         objects_dir = os.path.join(objects_dir, "")
 
-        to_remove = defaultdict(set)
-
-        # create index of all high_scoring_ORFs node_IDs, remove if not in panaroo graph
+        # create index of all high_scoring_ORFs node_IDs
         node_index = defaultdict(list)
         for colour_ID, file_path in ORF_file_paths.items():
             ORF_map = ggCaller_cpp.read_ORF_file(file_path)
 
             for ORF_ID, ORF_info in ORF_map.items():
-                if not isinstance(ORF_info, list):
-                    to_remove[colour_ID].add(ORF_ID)
-                    continue
                 delim = "_0_" if ORF_ID > 0 else "_refound_"
                 entry_ID = str(colour_ID) + delim + str(ORF_ID)
                 for node in ORF_info[0]:
                     node_index[abs(node)].append(entry_ID)
 
-        # remove genes not in panaroo graph
-        for colour_ID, file_path in ORF_file_paths.items():
-            ORF_map = ggCaller_cpp.read_ORF_file(file_path)
-
-            for ORF_ID in to_remove[colour_ID]:
-                del ORF_map[ORF_ID]
-            
-            ggCaller_cpp.save_ORF_file(file_path, ORF_map)
-
         with open(objects_dir + "node_index.dat", "wb") as o:
             cPickle.dump(node_index, o)
-
+        
+        # map each ORF to node in graph
+        with open(objects_dir + "ORF_to_node_map.dat", "wb") as o:
+            cPickle.dump(ids_len_stop, o)
 
     return
 
diff --git a/panaroo_runner/generate_output.py b/panaroo_runner/generate_output.py
index 50c96df..57f4139 100644
--- a/panaroo_runner/generate_output.py
+++ b/panaroo_runner/generate_output.py
@@ -273,10 +273,15 @@ def print_ORF_calls(ORF_file_paths, outfile, input_colours, overlap, DBG, trunca
                                             description=""))
     else:
         for colour, file_path in ORF_file_paths.items():
+            to_remove = set()
             ORF_map = ggCaller_cpp.read_ORF_file(file_path)
 
             for ORF_ID, ORF_info in ORF_map.items():
                 pan_ORF_id = str(colour) + "_0_" + str(ORF_ID)
+                # if pan_ORF_id not in ORF_map means has been removed so remove it from ORF_map
+                if pan_ORF_id not in ids_len_stop:
+                    to_remove.add(ORF_ID)
+                    continue
                 node = ids_len_stop[pan_ORF_id][2]
                 node_annotation = G.nodes[node]["description"]
                 length_centroid = G.nodes[node]['lengths'][G.nodes[node]['maxLenId']]
@@ -299,6 +304,11 @@ def print_ORF_calls(ORF_file_paths, outfile, input_colours, overlap, DBG, trunca
                     aa_records.append(
                         SeqRecord(Seq(gene).translate(), id=isolate_names[colour] + "_" + str(ORF_ID),
                                         description="ggcID=" + pan_ORF_id + ";" + gene_annotation))
+            
+            # remove from ORF_map and save
+            for ORF_ID in to_remove:
+                del ORF_map[ORF_ID]
+            ggCaller_cpp.save_ORF_file(file_path, ORF_map)
     SeqIO.write(DNA_records, outfile + ".ffn", "fasta")
     SeqIO.write(aa_records, outfile + ".faa", "fasta")
     return